diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index e15e5ed..e7af5ca 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -1845,6 +1845,41 @@ Return ONLY the summary, nothing else."#,
             description
         );
+        // Pre-flight similarity check — surface near-duplicates to the model
+        // before it commits to a new entity. Uses the first name token as the
+        // search term so "Sarah" matches when storing "Sarah Johnson" and vice
+        // versa. Exact-name matches are excluded (upsert_entity deduplicates
+        // those already). Results are appended to the tool response so the
+        // model can choose to use an existing entity's ID instead.
+        let similar_entities: Vec = {
+            use crate::database::{EntityFilter, KnowledgeDao};
+            use crate::database::knowledge_dao::normalize_entity_type;
+            let normalised_type = normalize_entity_type(&entity_type);
+            let first_token = name
+                .split_whitespace()
+                .next()
+                .unwrap_or(&name)
+                .to_string();
+            let filter = EntityFilter {
+                entity_type: None, // search all types, filter client-side to avoid case issues
+                status: Some("active".to_string()),
+                search: Some(first_token),
+                limit: 10,
+                offset: 0,
+            };
+            let mut kdao = self.knowledge_dao.lock().expect("Unable to lock KnowledgeDao");
+            kdao.list_entities(cx, filter)
+                .unwrap_or_default()
+                .0
+                .into_iter()
+                .filter(|e| {
+                    normalize_entity_type(&e.entity_type) == normalised_type
+                        && e.name.to_lowercase() != name.to_lowercase()
+                })
+                .map(|e| format!(" ID:{} | {} | {}", e.id, e.name, e.description))
+                .collect()
+        };
+
+        // Generate embedding for name + description (best-effort)
         let embed_text = format!("{} {}", name, description);
         let embedding: Option> = match ollama.generate_embedding(&embed_text).await {
@@ -1875,10 +1910,22 @@ Return ONLY the summary, nothing else."#,
             .lock()
             .expect("Unable to lock KnowledgeDao");
         match kdao.upsert_entity(cx, insert) {
-            Ok(entity) => format!(
-                "Entity stored: ID:{} | {} | {} | confidence:{:.2}",
-                entity.id, entity.entity_type, entity.name, entity.confidence
-            ),
+            Ok(entity) => {
+                let mut response = format!(
+                    "Entity stored: ID:{} | {} | {} | confidence:{:.2}",
+                    entity.id, entity.entity_type, entity.name, entity.confidence
+                );
+                if !similar_entities.is_empty() {
+                    response.push_str(
+                        "\nSimilar existing entities found — verify this is not a duplicate:\n",
+                    );
+                    response.push_str(&similar_entities.join("\n"));
+                    response.push_str(
+                        "\nIf one of these is the same entity, use their existing ID in store_fact instead of the newly created one.",
+                    );
+                }
+                response
+            }
             Err(e) => format!("Error storing entity: {:?}", e),
         }
     }
diff --git a/src/database/knowledge_dao.rs b/src/database/knowledge_dao.rs
index 09ffddf..05d1865 100644
--- a/src/database/knowledge_dao.rs
+++ b/src/database/knowledge_dao.rs
@@ -10,6 +10,25 @@ use crate::database::schema;
 use crate::database::{DbError, DbErrorKind, connect};
 use crate::otel::trace_db_call;
+// ---------------------------------------------------------------------------
+// Entity type normalisation
+// ---------------------------------------------------------------------------
+
+/// Canonicalise a model-supplied entity_type to a consistent lowercase form.
+/// Weak models frequently vary capitalisation ("Person" vs "person") or use
+/// synonym types ("location" vs "place"). Normalising here prevents duplicate
+/// entities that differ only by type spelling.
+pub(crate) fn normalize_entity_type(raw: &str) -> String {
+    match raw.to_lowercase().as_str() {
+        "person" | "people" | "human" | "individual" | "contact" => "person",
+        "place" | "location" | "venue" | "site" | "area" | "landmark" => "place",
+        "event" | "occasion" | "activity" | "celebration" => "event",
+        "thing" | "object" | "item" | "product" => "thing",
+        other => other,
+    }
+    .to_string()
+}
+
 // ---------------------------------------------------------------------------
 // Filter / patch types
 // ---------------------------------------------------------------------------
@@ -250,13 +269,22 @@ impl KnowledgeDao for SqliteKnowledgeDao {
             let mut conn = self.connection.lock().expect("KnowledgeDao lock");
-            // Case-insensitive lookup by name + entity_type
+            // Normalise type before lookup and insert so that model variations
+            // ("Person" / "person", "location" / "place") collapse to one row.
+            let entity = InsertEntity {
+                entity_type: normalize_entity_type(&entity.entity_type),
+                ..entity
+            };
+
+            // Case-insensitive lookup by name + entity_type.
+            // Use lower() on both sides so existing dirty rows ("Person") still match.
             let name_lower = entity.name.to_lowercase();
+            let type_lower = entity.entity_type.to_lowercase();
             let existing: Option = entities
                 .filter(diesel::dsl::sql::(&format!(
-                    "lower(name) = '{}' AND entity_type = '{}'",
+                    "lower(name) = '{}' AND lower(entity_type) = '{}'",
                     name_lower.replace('\'', "''"),
-                    entity.entity_type.replace('\'', "''")
+                    type_lower.replace('\'', "''")
                 )))
                 .first::(conn.deref_mut())
                 .optional()