fix: reduce duplicate entities from weak model inconsistency
Adds normalize_entity_type() which lowercases and canonicalises synonyms (location→place, human→person, etc.) before every upsert. The SQL lookup now uses lower(entity_type) on both sides so existing dirty rows (Person, Location) correctly deduplicate against normalised writes without a migration. Adds a pre-flight similarity check in tool_store_entity: before upserting, it searches active entities of the same type using the first name token. Any non-exact matches are appended to the tool response so the agentic loop can choose to reuse an existing entity ID rather than create a duplicate.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1845,6 +1845,41 @@ Return ONLY the summary, nothing else."#,
|
||||
description
|
||||
);
|
||||
|
||||
// Pre-flight similarity check — surface near-duplicates to the model
|
||||
// before it commits to a new entity. Uses the first name token as the
|
||||
// search term so "Sarah" matches when storing "Sarah Johnson" and vice
|
||||
// versa. Exact-name matches are excluded (upsert_entity deduplicates
|
||||
// those already). Results are appended to the tool response so the
|
||||
// model can choose to use an existing entity's ID instead.
|
||||
let similar_entities: Vec<String> = {
|
||||
use crate::database::{EntityFilter, KnowledgeDao};
|
||||
use crate::database::knowledge_dao::normalize_entity_type;
|
||||
let normalised_type = normalize_entity_type(&entity_type);
|
||||
let first_token = name
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.unwrap_or(&name)
|
||||
.to_string();
|
||||
let filter = EntityFilter {
|
||||
entity_type: None, // search all types, filter client-side to avoid case issues
|
||||
status: Some("active".to_string()),
|
||||
search: Some(first_token),
|
||||
limit: 10,
|
||||
offset: 0,
|
||||
};
|
||||
let mut kdao = self.knowledge_dao.lock().expect("Unable to lock KnowledgeDao");
|
||||
kdao.list_entities(cx, filter)
|
||||
.unwrap_or_default()
|
||||
.0
|
||||
.into_iter()
|
||||
.filter(|e| {
|
||||
normalize_entity_type(&e.entity_type) == normalised_type
|
||||
&& e.name.to_lowercase() != name.to_lowercase()
|
||||
})
|
||||
.map(|e| format!(" ID:{} | {} | {}", e.id, e.name, e.description))
|
||||
.collect()
|
||||
};
|
||||
|
||||
// Generate embedding for name + description (best-effort)
|
||||
let embed_text = format!("{} {}", name, description);
|
||||
let embedding: Option<Vec<u8>> = match ollama.generate_embedding(&embed_text).await {
|
||||
@@ -1875,10 +1910,22 @@ Return ONLY the summary, nothing else."#,
|
||||
.lock()
|
||||
.expect("Unable to lock KnowledgeDao");
|
||||
match kdao.upsert_entity(cx, insert) {
|
||||
Ok(entity) => format!(
|
||||
"Entity stored: ID:{} | {} | {} | confidence:{:.2}",
|
||||
entity.id, entity.entity_type, entity.name, entity.confidence
|
||||
),
|
||||
Ok(entity) => {
|
||||
let mut response = format!(
|
||||
"Entity stored: ID:{} | {} | {} | confidence:{:.2}",
|
||||
entity.id, entity.entity_type, entity.name, entity.confidence
|
||||
);
|
||||
if !similar_entities.is_empty() {
|
||||
response.push_str(
|
||||
"\nSimilar existing entities found — verify this is not a duplicate:\n",
|
||||
);
|
||||
response.push_str(&similar_entities.join("\n"));
|
||||
response.push_str(
|
||||
"\nIf one of these is the same entity, use their existing ID in store_fact instead of the newly created one.",
|
||||
);
|
||||
}
|
||||
response
|
||||
}
|
||||
Err(e) => format!("Error storing entity: {:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user