Daily Summary Embedding Testing

This commit is contained in:
Cameron
2026-01-08 13:41:32 -05:00
parent 61e10f7678
commit 084994e0b5
8 changed files with 1000 additions and 106 deletions

View File

@@ -1,3 +1,4 @@
use chrono::NaiveDate;
use diesel::prelude::*;
use diesel::sqlite::SqliteConnection;
use serde::Serialize;
@@ -47,6 +48,17 @@ pub trait DailySummaryDao: Sync + Send {
limit: usize,
) -> Result<Vec<DailySummary>, DbError>;
/// Find semantically similar daily summaries with time-based weighting
/// Combines cosine similarity with temporal proximity to target_date
/// Final score = similarity * time_weight, where time_weight decays with distance from target_date
fn find_similar_summaries_with_time_weight(
&mut self,
context: &opentelemetry::Context,
query_embedding: &[f32],
target_date: &str,
limit: usize,
) -> Result<Vec<DailySummary>, DbError>;
/// Check if a summary exists for a given date and contact
fn summary_exists(
&mut self,
@@ -231,14 +243,22 @@ impl DailySummaryDao for SqliteDailySummaryDao {
// Sort by similarity (highest first)
scored_summaries.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
// Filter out poor matches (similarity < 0.3 is likely noise)
scored_summaries.retain(|(similarity, _)| *similarity >= 0.3);
// Log similarity distribution
if !scored_summaries.is_empty() {
let top_score = scored_summaries.first().map(|(s, _)| *s).unwrap_or(0.0);
let median_score = scored_summaries.get(scored_summaries.len() / 2).map(|(s, _)| *s).unwrap_or(0.0);
log::info!(
"Daily summary similarity - Top: {:.3}, Median: {:.3}, Count: {}",
scored_summaries.first().map(|(s, _)| *s).unwrap_or(0.0),
scored_summaries.get(scored_summaries.len() / 2).map(|(s, _)| *s).unwrap_or(0.0),
"Daily summary similarity - Top: {:.3}, Median: {:.3}, Count: {} (after 0.3 threshold)",
top_score,
median_score,
scored_summaries.len()
);
} else {
log::warn!("No daily summaries met the 0.3 similarity threshold");
}
// Take top N and log matches
@@ -262,6 +282,128 @@ impl DailySummaryDao for SqliteDailySummaryDao {
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
/// Find daily summaries semantically similar to `query_embedding`, weighted by
/// temporal proximity to `target_date`.
///
/// Final score = cosine_similarity * sqrt(time_weight), where
/// time_weight = 0.5^(days_distance / 30) — an exponential decay with a 30-day
/// half-life (1.0 at 0 days, 0.5 at 30, 0.25 at 60). The sqrt softens the
/// temporal term so semantic similarity still dominates the ranking.
///
/// Summaries whose base cosine similarity is below 0.5 are discarded. Returns
/// at most `limit` results, best combined score first.
///
/// Errors: `query_embedding` not exactly 768 dims, `target_date` not
/// `YYYY-MM-DD`, or a database failure — all surfaced as `DbError` (QueryError).
fn find_similar_summaries_with_time_weight(
    &mut self,
    context: &opentelemetry::Context,
    query_embedding: &[f32],
    target_date: &str,
    limit: usize,
) -> Result<Vec<DailySummary>, DbError> {
    trace_db_call(context, "query", "find_similar_summaries_with_time_weight", |_span| {
        let mut conn = self.connection.lock().expect("Unable to get DailySummaryDao");

        // Embedding dimensionality is fixed at 768 elsewhere in this DAO; reject
        // mismatched queries early rather than computing a meaningless similarity.
        if query_embedding.len() != 768 {
            return Err(anyhow::anyhow!(
                "Invalid query embedding dimensions: {} (expected 768)",
                query_embedding.len()
            ));
        }

        // Parse target date (expected format: YYYY-MM-DD).
        let target = NaiveDate::parse_from_str(target_date, "%Y-%m-%d")
            .map_err(|e| anyhow::anyhow!("Invalid target date: {}", e))?;

        // Load all summaries with embeddings.
        // NOTE(review): full-table scan with in-process scoring; fine while the
        // table is small, revisit if summary volume grows.
        let results = diesel::sql_query(
            "SELECT id, date, contact, summary, message_count, embedding, created_at, model_version
FROM daily_conversation_summaries"
        )
        .load::<DailySummaryWithVectorRow>(conn.deref_mut())
        .map_err(|e| anyhow::anyhow!("Query error: {:?}", e))?;

        log::info!("Loaded {} daily summaries for time-weighted similarity (target: {})", results.len(), target_date);

        // Score every row: (combined_score, base_similarity, days_distance, summary).
        // Rows with undeserializable embeddings are logged and skipped; rows with
        // unparseable dates are silently skipped (the `.ok()?` below).
        let mut scored_summaries: Vec<(f32, f32, i64, DailySummary)> = results
            .into_iter()
            .filter_map(|row| {
                let embedding = match Self::deserialize_vector(&row.embedding) {
                    Ok(embedding) => embedding,
                    Err(e) => {
                        log::warn!("Failed to deserialize embedding for summary {}: {:?}", row.id, e);
                        return None;
                    }
                };
                let similarity = Self::cosine_similarity(query_embedding, &embedding);
                let summary_date = NaiveDate::parse_from_str(&row.date, "%Y-%m-%d").ok()?;
                let days_distance = (target - summary_date).num_days().abs();
                // Exponential decay with 30-day half-life; sqrt softens its impact
                // so semantic similarity still matters most.
                let time_weight = 0.5_f32.powf(days_distance as f32 / 30.0);
                let combined_score = similarity * time_weight.sqrt();
                Some((
                    combined_score,
                    similarity,
                    days_distance,
                    DailySummary {
                        id: row.id,
                        date: row.date,
                        contact: row.contact,
                        summary: row.summary,
                        message_count: row.message_count,
                        created_at: row.created_at,
                        model_version: row.model_version,
                    },
                ))
            })
            .collect();

        // Drop weak matches BEFORE sorting: sorts a smaller set, and NaN
        // similarities (e.g. from a zero-norm embedding) fail `>= 0.5` and are
        // discarded here instead of reaching the comparator.
        scored_summaries.retain(|(_, similarity, _, _)| *similarity >= 0.5);

        // Sort by combined score, highest first. total_cmp is a total order on
        // f32 — the previous partial_cmp + unwrap_or(Equal) pattern violates the
        // sort ordering contract when NaN is present. Unstable sort: ties between
        // equal scores are interchangeable, and it avoids the allocation.
        scored_summaries.sort_unstable_by(|a, b| b.0.total_cmp(&a.0));

        // Log similarity distribution (no-panic access via first()).
        if let Some((top_combined, top_sim, top_days, _)) = scored_summaries.first() {
            log::info!(
                "Time-weighted similarity - Top: combined={:.3} (sim={:.3}, days={}), Count: {} matches",
                top_combined,
                top_sim,
                top_days,
                scored_summaries.len()
            );
        } else {
            log::warn!("No daily summaries met the 0.5 similarity threshold");
        }

        // Take top N, logging each returned match for diagnostics.
        let top_results: Vec<DailySummary> = scored_summaries
            .into_iter()
            .take(limit)
            .map(|(combined, similarity, days, summary)| {
                log::info!(
                    "Summary match: combined={:.3} (sim={:.3}, days={}), date={}, contact={}, summary=\"{}\"",
                    combined,
                    similarity,
                    days,
                    summary.date,
                    summary.contact,
                    // First 80 chars only, so log lines stay readable.
                    summary.summary.chars().take(80).collect::<String>()
                );
                summary
            })
            .collect();

        Ok(top_results)
    })
    .map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn summary_exists(
&mut self,
context: &opentelemetry::Context,

View File

@@ -18,15 +18,11 @@ pub mod models;
pub mod schema;
pub mod search_dao;
pub use calendar_dao::{
CalendarEventDao, SqliteCalendarEventDao,
};
pub use calendar_dao::{CalendarEventDao, SqliteCalendarEventDao};
pub use daily_summary_dao::{DailySummaryDao, InsertDailySummary, SqliteDailySummaryDao};
pub use embeddings_dao::{EmbeddingDao, InsertMessageEmbedding};
pub use insights_dao::{InsightDao, SqliteInsightDao};
pub use location_dao::{
LocationHistoryDao, SqliteLocationHistoryDao,
};
pub use location_dao::{LocationHistoryDao, SqliteLocationHistoryDao};
pub use search_dao::{SearchHistoryDao, SqliteSearchHistoryDao};
pub trait UserDao {