Daily Summary Embedding Testing
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
use chrono::NaiveDate;
|
||||
use diesel::prelude::*;
|
||||
use diesel::sqlite::SqliteConnection;
|
||||
use serde::Serialize;
|
||||
@@ -47,6 +48,17 @@ pub trait DailySummaryDao: Sync + Send {
|
||||
limit: usize,
|
||||
) -> Result<Vec<DailySummary>, DbError>;
|
||||
|
||||
/// Find semantically similar daily summaries with time-based weighting.
|
||||
/// Combines cosine similarity with temporal proximity to `target_date`.
|
||||
/// In `SqliteDailySummaryDao` the ranking score is `similarity * sqrt(time_weight)`, where
/// `time_weight = 0.5^(days / 30)` and `days` is the absolute distance in days between a
/// summary's date and `target_date`; summaries whose raw similarity is below 0.5 are dropped.
///
/// `target_date` is parsed with the `%Y-%m-%d` format. At most `limit` summaries are
/// returned, best score first.
///
/// # Errors
///
/// The SQLite implementation maps every failure (query embedding that is not 768 values
/// long, unparseable `target_date`, query failure) to `DbErrorKind::QueryError`.
|
||||
fn find_similar_summaries_with_time_weight(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
query_embedding: &[f32],
|
||||
target_date: &str,
|
||||
limit: usize,
|
||||
) -> Result<Vec<DailySummary>, DbError>;
|
||||
|
||||
/// Check if a summary exists for a given date and contact
|
||||
fn summary_exists(
|
||||
&mut self,
|
||||
@@ -231,14 +243,22 @@ impl DailySummaryDao for SqliteDailySummaryDao {
|
||||
// Sort by similarity (highest first)
|
||||
scored_summaries.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
// Filter out poor matches (similarity < 0.3 is likely noise)
|
||||
scored_summaries.retain(|(similarity, _)| *similarity >= 0.3);
|
||||
|
||||
// Log similarity distribution
|
||||
if !scored_summaries.is_empty() {
|
||||
let top_score = scored_summaries.first().map(|(s, _)| *s).unwrap_or(0.0);
|
||||
let median_score = scored_summaries.get(scored_summaries.len() / 2).map(|(s, _)| *s).unwrap_or(0.0);
|
||||
|
||||
log::info!(
|
||||
"Daily summary similarity - Top: {:.3}, Median: {:.3}, Count: {}",
|
||||
scored_summaries.first().map(|(s, _)| *s).unwrap_or(0.0),
|
||||
scored_summaries.get(scored_summaries.len() / 2).map(|(s, _)| *s).unwrap_or(0.0),
|
||||
"Daily summary similarity - Top: {:.3}, Median: {:.3}, Count: {} (after 0.3 threshold)",
|
||||
top_score,
|
||||
median_score,
|
||||
scored_summaries.len()
|
||||
);
|
||||
} else {
|
||||
log::warn!("No daily summaries met the 0.3 similarity threshold");
|
||||
}
|
||||
|
||||
// Take top N and log matches
|
||||
@@ -262,6 +282,128 @@ impl DailySummaryDao for SqliteDailySummaryDao {
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
|
||||
fn find_similar_summaries_with_time_weight(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
query_embedding: &[f32],
|
||||
target_date: &str,
|
||||
limit: usize,
|
||||
) -> Result<Vec<DailySummary>, DbError> {
|
||||
trace_db_call(context, "query", "find_similar_summaries_with_time_weight", |_span| {
|
||||
let mut conn = self.connection.lock().expect("Unable to get DailySummaryDao");
|
||||
|
||||
if query_embedding.len() != 768 {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Invalid query embedding dimensions: {} (expected 768)",
|
||||
query_embedding.len()
|
||||
));
|
||||
}
|
||||
|
||||
// Parse target date
|
||||
let target = NaiveDate::parse_from_str(target_date, "%Y-%m-%d")
|
||||
.map_err(|e| anyhow::anyhow!("Invalid target date: {}", e))?;
|
||||
|
||||
// Load all summaries with embeddings
|
||||
let results = diesel::sql_query(
|
||||
"SELECT id, date, contact, summary, message_count, embedding, created_at, model_version
|
||||
FROM daily_conversation_summaries"
|
||||
)
|
||||
.load::<DailySummaryWithVectorRow>(conn.deref_mut())
|
||||
.map_err(|e| anyhow::anyhow!("Query error: {:?}", e))?;
|
||||
|
||||
log::info!("Loaded {} daily summaries for time-weighted similarity (target: {})", results.len(), target_date);
|
||||
|
||||
// Compute time-weighted similarity for each summary
|
||||
// Score = cosine_similarity * time_weight
|
||||
// time_weight = 1 / (1 + days_distance/30) - decays with ~30 day half-life
|
||||
let mut scored_summaries: Vec<(f32, f32, i64, DailySummary)> = results
|
||||
.into_iter()
|
||||
.filter_map(|row| {
|
||||
match Self::deserialize_vector(&row.embedding) {
|
||||
Ok(embedding) => {
|
||||
let similarity = Self::cosine_similarity(query_embedding, &embedding);
|
||||
|
||||
// Calculate time weight
|
||||
let summary_date = NaiveDate::parse_from_str(&row.date, "%Y-%m-%d").ok()?;
|
||||
let days_distance = (target - summary_date).num_days().abs();
|
||||
|
||||
// Exponential decay with 30-day half-life
|
||||
// At 0 days: weight = 1.0
|
||||
// At 30 days: weight = 0.5
|
||||
// At 60 days: weight = 0.25
|
||||
// At 365 days: weight ~= 0.0001
|
||||
let time_weight = 0.5_f32.powf(days_distance as f32 / 30.0);
|
||||
|
||||
// Combined score - but ensure semantic similarity still matters
|
||||
// We use sqrt to soften the time weight's impact
|
||||
let combined_score = similarity * time_weight.sqrt();
|
||||
|
||||
Some((
|
||||
combined_score,
|
||||
similarity,
|
||||
days_distance,
|
||||
DailySummary {
|
||||
id: row.id,
|
||||
date: row.date,
|
||||
contact: row.contact,
|
||||
summary: row.summary,
|
||||
message_count: row.message_count,
|
||||
created_at: row.created_at,
|
||||
model_version: row.model_version,
|
||||
},
|
||||
))
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!("Failed to deserialize embedding for summary {}: {:?}", row.id, e);
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort by combined score (highest first)
|
||||
scored_summaries.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
// Filter out poor matches (base similarity < 0.5 - stricter than before since we have time weighting)
|
||||
scored_summaries.retain(|(_, similarity, _, _)| *similarity >= 0.5);
|
||||
|
||||
// Log similarity distribution
|
||||
if !scored_summaries.is_empty() {
|
||||
let (top_combined, top_sim, top_days, _) = &scored_summaries[0];
|
||||
log::info!(
|
||||
"Time-weighted similarity - Top: combined={:.3} (sim={:.3}, days={}), Count: {} matches",
|
||||
top_combined,
|
||||
top_sim,
|
||||
top_days,
|
||||
scored_summaries.len()
|
||||
);
|
||||
} else {
|
||||
log::warn!("No daily summaries met the 0.5 similarity threshold");
|
||||
}
|
||||
|
||||
// Take top N and log matches
|
||||
let top_results: Vec<DailySummary> = scored_summaries
|
||||
.into_iter()
|
||||
.take(limit)
|
||||
.map(|(combined, similarity, days, summary)| {
|
||||
log::info!(
|
||||
"Summary match: combined={:.3} (sim={:.3}, days={}), date={}, contact={}, summary=\"{}\"",
|
||||
combined,
|
||||
similarity,
|
||||
days,
|
||||
summary.date,
|
||||
summary.contact,
|
||||
summary.summary.chars().take(80).collect::<String>()
|
||||
);
|
||||
summary
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(top_results)
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
|
||||
fn summary_exists(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
|
||||
Reference in New Issue
Block a user