fix: reduce duplicate entities from weak model inconsistency
Adds normalize_entity_type() which lowercases and canonicalises synonyms (location→place, human→person, etc.) before every upsert. The SQL lookup now uses lower(entity_type) on both sides so existing dirty rows (Person, Location) correctly deduplicate against normalised writes without a migration. Adds a pre-flight similarity check in tool_store_entity: before upserting, it searches active entities of the same type using the first name token. Any non-exact matches are appended to the tool response so the agentic loop can choose to reuse an existing entity ID rather than create a duplicate.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1845,6 +1845,41 @@ Return ONLY the summary, nothing else."#,
|
||||
description
|
||||
);
|
||||
|
||||
// Pre-flight similarity check — surface near-duplicates to the model
|
||||
// before it commits to a new entity. Uses the first name token as the
|
||||
// search term so "Sarah" matches when storing "Sarah Johnson" and vice
|
||||
// versa. Exact-name matches are excluded (upsert_entity deduplicates
|
||||
// those already). Results are appended to the tool response so the
|
||||
// model can choose to use an existing entity's ID instead.
|
||||
let similar_entities: Vec<String> = {
|
||||
use crate::database::{EntityFilter, KnowledgeDao};
|
||||
use crate::database::knowledge_dao::normalize_entity_type;
|
||||
let normalised_type = normalize_entity_type(&entity_type);
|
||||
let first_token = name
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.unwrap_or(&name)
|
||||
.to_string();
|
||||
let filter = EntityFilter {
|
||||
entity_type: None, // search all types, filter client-side to avoid case issues
|
||||
status: Some("active".to_string()),
|
||||
search: Some(first_token),
|
||||
limit: 10,
|
||||
offset: 0,
|
||||
};
|
||||
let mut kdao = self.knowledge_dao.lock().expect("Unable to lock KnowledgeDao");
|
||||
kdao.list_entities(cx, filter)
|
||||
.unwrap_or_default()
|
||||
.0
|
||||
.into_iter()
|
||||
.filter(|e| {
|
||||
normalize_entity_type(&e.entity_type) == normalised_type
|
||||
&& e.name.to_lowercase() != name.to_lowercase()
|
||||
})
|
||||
.map(|e| format!(" ID:{} | {} | {}", e.id, e.name, e.description))
|
||||
.collect()
|
||||
};
|
||||
|
||||
// Generate embedding for name + description (best-effort)
|
||||
let embed_text = format!("{} {}", name, description);
|
||||
let embedding: Option<Vec<u8>> = match ollama.generate_embedding(&embed_text).await {
|
||||
@@ -1875,10 +1910,22 @@ Return ONLY the summary, nothing else."#,
|
||||
.lock()
|
||||
.expect("Unable to lock KnowledgeDao");
|
||||
match kdao.upsert_entity(cx, insert) {
|
||||
Ok(entity) => format!(
|
||||
"Entity stored: ID:{} | {} | {} | confidence:{:.2}",
|
||||
entity.id, entity.entity_type, entity.name, entity.confidence
|
||||
),
|
||||
Ok(entity) => {
|
||||
let mut response = format!(
|
||||
"Entity stored: ID:{} | {} | {} | confidence:{:.2}",
|
||||
entity.id, entity.entity_type, entity.name, entity.confidence
|
||||
);
|
||||
if !similar_entities.is_empty() {
|
||||
response.push_str(
|
||||
"\nSimilar existing entities found — verify this is not a duplicate:\n",
|
||||
);
|
||||
response.push_str(&similar_entities.join("\n"));
|
||||
response.push_str(
|
||||
"\nIf one of these is the same entity, use their existing ID in store_fact instead of the newly created one.",
|
||||
);
|
||||
}
|
||||
response
|
||||
}
|
||||
Err(e) => format!("Error storing entity: {:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user