knowledge: cosine dedup, fact create endpoint, recall nudge

Phase 1 of the knowledge curation work. Three small server-side changes
to support an Apollo-side curation surface and reduce the agent's near-
duplicate output rate going forward:

- upsert_entity grows an embedding-cosine fallback after the exact name
  match misses. A new entity whose embedding has a cosine similarity at or
  above ENTITY_DEDUP_COSINE_THRESHOLD (default 0.92) against any same-type,
  non-rejected entity collapses onto the best-matching existing row.
  Eliminates the Sarah / Sara / Sarah J. trio the FTS5 prefix check was
  missing.
- POST /knowledge/facts is added, symmetric with the existing PATCH/DELETE,
  so the curation UI can create facts directly. It is persona-scoped via
  X-Persona-Id, validates subject (and optional object) entity existence,
  and reuses KnowledgeDao::upsert_fact so corroboration semantics match the
  agent path.
- One sentence in build_system_content telling the agent to call
  recall_entities before store_entity when a name resembles something
  already known. Cheap; complements the DAO-layer guard.

Includes upsert_entity_collapses_near_duplicate_by_embedding test
covering both the collapse-on-near-match path and the don't-collapse-on-
unrelated-embedding path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-05-10 15:16:05 -04:00
parent 827a78dd79
commit d7aee4f228
3 changed files with 252 additions and 3 deletions

View File

@@ -282,6 +282,20 @@ impl SqliteKnowledgeDao {
}
}
/// Default cosine-similarity threshold at or above which a new entity
/// collapses into an existing same-type entity at upsert time. The agent's
/// pre-flight name search uses FTS5 prefix tokens, which misses near-dupes
/// like "Sarah" / "Sara" / "Sarah J." that share a description-rich embedding.
/// Override via `ENTITY_DEDUP_COSINE_THRESHOLD` env var when tuning.
const ENTITY_DEDUP_COSINE_THRESHOLD_DEFAULT: f32 = 0.92;

/// Interprets the raw text of the `ENTITY_DEDUP_COSINE_THRESHOLD` override.
///
/// Returns the parsed value, or [`ENTITY_DEDUP_COSINE_THRESHOLD_DEFAULT`] when
/// `raw` is `None`, is not a valid `f32`, or parses to NaN.
fn parse_entity_dedup_cosine_threshold(raw: Option<&str>) -> f32 {
    // Trim first: values sourced from env files or shell exports often carry
    // stray whitespace, which would otherwise fail to parse and silently
    // discard the operator's override.
    raw.and_then(|v| v.trim().parse::<f32>().ok())
        // "NaN" is a valid f32 literal, but every `sim >= NaN` comparison is
        // false, so accepting it would switch dedup off without any signal.
        .filter(|threshold| !threshold.is_nan())
        .unwrap_or(ENTITY_DEDUP_COSINE_THRESHOLD_DEFAULT)
}

/// Returns the entity-dedup cosine threshold currently in effect.
///
/// Reads `ENTITY_DEDUP_COSINE_THRESHOLD` from the process environment on each
/// call; an unset, non-UTF-8, unparsable, or NaN value yields
/// [`ENTITY_DEDUP_COSINE_THRESHOLD_DEFAULT`].
fn entity_dedup_cosine_threshold() -> f32 {
    parse_entity_dedup_cosine_threshold(
        std::env::var("ENTITY_DEDUP_COSINE_THRESHOLD")
            .ok()
            .as_deref(),
    )
}
impl KnowledgeDao for SqliteKnowledgeDao {
// -----------------------------------------------------------------------
// Entity operations
@@ -308,7 +322,7 @@ impl KnowledgeDao for SqliteKnowledgeDao {
// Use lower() on both sides so existing dirty rows ("Person") still match.
let name_lower = entity.name.to_lowercase();
let type_lower = entity.entity_type.to_lowercase();
let existing: Option<Entity> = entities
let mut existing: Option<Entity> = entities
.filter(diesel::dsl::sql::<diesel::sql_types::Bool>(&format!(
"lower(name) = '{}' AND lower(entity_type) = '{}'",
name_lower.replace('\'', "''"),
@@ -318,6 +332,49 @@ impl KnowledgeDao for SqliteKnowledgeDao {
.optional()
.map_err(|e| anyhow::anyhow!("Query error: {}", e))?;
// Fuzzy-match fallback: if no exact name match and the incoming
// entity carries an embedding, compare against same-type entities'
// embeddings and collapse if any are above the cosine threshold.
if existing.is_none()
&& let Some(new_emb_bytes) = entity.embedding.as_ref()
&& let Ok(new_vec) = Self::deserialize_embedding(new_emb_bytes)
&& !new_vec.is_empty()
{
let threshold = entity_dedup_cosine_threshold();
let candidates: Vec<Entity> = entities
.filter(embedding.is_not_null())
.filter(entity_type.eq(&entity.entity_type))
.filter(status.ne("rejected"))
.load::<Entity>(conn.deref_mut())
.map_err(|e| anyhow::anyhow!("Query error: {}", e))?;
let mut best: Option<(Entity, f32)> = None;
for cand in candidates {
let Some(cand_bytes) = cand.embedding.as_ref() else {
continue;
};
let Ok(cand_vec) = Self::deserialize_embedding(cand_bytes) else {
continue;
};
let sim = Self::cosine_similarity(&new_vec, &cand_vec);
if sim >= threshold && best.as_ref().is_none_or(|(_, s)| sim > *s) {
best = Some((cand, sim));
}
}
if let Some((cand, sim)) = best {
log::info!(
"entity dedup: collapsing new '{}' ({}) into existing '{}' (id={}, cos={:.3})",
entity.name,
entity.entity_type,
cand.name,
cand.id,
sim
);
existing = Some(cand);
}
}
if let Some(existing_entity) = existing {
// Update description, embedding, updated_at
diesel::update(entities.filter(id.eq(existing_entity.id)))
@@ -1276,4 +1333,87 @@ mod tests {
"FK should reject fact whose persona doesn't exist"
);
}
#[test]
fn upsert_entity_collapses_near_duplicate_by_embedding() {
    // DAO-level safety net for near-duplicate names. The agent's pre-flight
    // lookup relies on FTS5 prefix tokens and so lets "Sarah" / "Sara" /
    // "Sarah J." through; upsert_entity is expected to fold such an entity
    // into an existing same-type row when there is no exact (name, type)
    // match but the embeddings are close enough.
    let cx = opentelemetry::Context::new();
    let conn = connection_with_fks_on();
    let mut dao = SqliteKnowledgeDao::from_connection(conn.clone());

    // Every row in this test is an active "person" with the same confidence
    // and timestamps; only name, description and embedding differ.
    let person = |name: &str, description: &str, embedding| InsertEntity {
        name: name.to_string(),
        entity_type: "person".to_string(),
        description: description.to_string(),
        embedding: Some(embedding),
        confidence: 0.6,
        status: "active".to_string(),
        created_at: 0,
        updated_at: 0,
    };

    // Reference vector, plus a copy with one small extra component so the
    // two stay almost parallel (cosine still well above 0.92).
    let mut base_vec = vec![0.0_f32; 64];
    base_vec[0] = 1.0;
    base_vec[1] = 0.5;
    let near_vec = {
        let mut nudged = base_vec.clone();
        nudged[2] = 0.05;
        nudged
    };
    // A vector whose only non-zero component is one the reference leaves at 0.
    let mut unrelated_vec = vec![0.0_f32; 64];
    unrelated_vec[10] = 1.0;

    // 1. Seed the row that later upserts may collapse onto.
    let seeded = dao
        .upsert_entity(
            &cx,
            person(
                "Sarah",
                "tagged friend",
                SqliteKnowledgeDao::serialize_embedding(&base_vec),
            ),
        )
        .unwrap();

    // 2. Different spelling, near-identical embedding: must reuse the seed.
    let collapsed = dao
        .upsert_entity(
            &cx,
            person(
                "Sara",
                "tagged friend",
                SqliteKnowledgeDao::serialize_embedding(&near_vec),
            ),
        )
        .unwrap();
    assert_eq!(
        collapsed.id, seeded.id,
        "near-duplicate by cosine should reuse the existing entity id"
    );

    // 3. Different name and a clearly different embedding: must get a new row.
    let distinct = dao
        .upsert_entity(
            &cx,
            person(
                "Bob",
                "",
                SqliteKnowledgeDao::serialize_embedding(&unrelated_vec),
            ),
        )
        .unwrap();
    assert_ne!(
        distinct.id, seeded.id,
        "unrelated embedding should not collapse"
    );
}
}