Daily Summary Embedding Testing

This commit is contained in:
Cameron
2026-01-08 13:41:32 -05:00
parent 61e10f7678
commit 084994e0b5
8 changed files with 1000 additions and 106 deletions

View File

@@ -10,6 +10,75 @@ use crate::ai::{OllamaClient, SmsApiClient, SmsMessage};
use crate::database::{DailySummaryDao, InsertDailySummary};
use crate::otel::global_tracer;
/// Strip boilerplate prefixes and common phrases from summaries before embedding.
/// This improves embedding diversity by removing structural similarity.
///
/// The following are removed, in order, from the start of `summary`:
/// 1. Leading markdown header lines (lines starting with `#`). If the whole
///    input is a single header line, only the `#` markers are dropped.
/// 2. One "Summary:" label (plain, or wrapped in markdown bold/italic),
///    matched ASCII case-insensitively.
/// 3. One known opening phrase (e.g. "Today, I discussed"), matched ASCII
///    case-insensitively, plus any run of whitespace, `,`, `:` or `-` after it.
/// 4. One leading pair of `**` bold markers; the text between them is kept.
///
/// Returns the remaining text with surrounding whitespace trimmed. Input that
/// matches none of the patterns is returned trimmed but otherwise unchanged.
pub fn strip_summary_boilerplate(summary: &str) -> String {
    // Matching is case-insensitive, so one spelling per markup variant suffices.
    const SUMMARY_PREFIXES: [&str; 4] = [
        "**Summary:**",
        "**Summary**:",
        "*Summary:*",
        "Summary:",
    ];
    // Longer phrases come first so "Today, Melissa and I discussed" wins over
    // the shorter "Melissa and I discussed" style entries.
    const OPENING_PHRASES: [&str; 12] = [
        "Today, Melissa and I discussed",
        "Today, Amanda and I discussed",
        "Today Melissa and I discussed",
        "Today Amanda and I discussed",
        "Melissa and I discussed",
        "Amanda and I discussed",
        "Today, I discussed",
        "Today I discussed",
        "The conversation covered",
        "This conversation covered",
        "In this conversation,",
        "During this conversation,",
    ];

    // Work on borrowed slices throughout; allocate once at the end.
    let mut text = summary.trim();

    // Remove markdown headers
    while text.starts_with('#') {
        match text.find('\n') {
            // Drop the whole header line and any blank lines after it.
            Some(pos) => text = text[pos..].trim_start(),
            None => {
                // Single line with just headers, try to extract content after #s
                text = text.trim_start_matches('#').trim();
                break;
            }
        }
    }

    // Remove "Summary:" prefix variations (with optional markdown bold)
    if let Some(rest) = SUMMARY_PREFIXES
        .iter()
        .find_map(|prefix| strip_prefix_ignore_ascii_case(text, prefix))
    {
        text = rest.trim_start();
    }

    // Remove common opening phrases that add no semantic value
    if let Some(rest) = OPENING_PHRASES
        .iter()
        .find_map(|phrase| strip_prefix_ignore_ascii_case(text, phrase))
    {
        // Remove any mix of whitespace and separator punctuation left behind
        // (e.g. ": - the plan" -> "the plan").
        text = rest.trim_start_matches(|c: char| c.is_whitespace() || matches!(c, ',' | ':' | '-'));
    }

    // Remove any remaining leading markdown bold markers
    if let Some(after_open) = text.strip_prefix("**") {
        if let Some(end) = after_open.find("**") {
            // Keep the content between ** but remove the markers
            let unbolded = format!("{}{}", &after_open[..end], &after_open[end + 2..]);
            return unbolded.trim().to_string();
        }
    }

    text.trim().to_string()
}

/// Return the remainder of `text` after `prefix` when `text` starts with
/// `prefix`, compared ASCII case-insensitively; otherwise `None`.
///
/// Uses `str::get` so a prefix length that falls outside `text` or inside a
/// multi-byte character yields `None` instead of panicking.
fn strip_prefix_ignore_ascii_case<'a>(text: &'a str, prefix: &str) -> Option<&'a str> {
    let head = text.get(..prefix.len())?;
    if head.eq_ignore_ascii_case(prefix) {
        // `get` succeeded, so `prefix.len()` is a valid char boundary in `text`.
        Some(&text[prefix.len()..])
    } else {
        None
    }
}
/// Generate and embed daily conversation summaries for a date range
/// Default: August 2024 ±30 days (July 1 - September 30, 2024)
pub async fn generate_daily_summaries(
@@ -238,22 +307,34 @@ async fn generate_and_store_daily_summary(
let weekday = date.format("%A");
let prompt = format!(
r#"Summarize this day's conversation in 3-5 sentences. Focus on:
- Key topics, activities, and events discussed
- Places, people, or organizations mentioned
- Plans made or decisions discussed
- Overall mood or themes of the day
r#"Summarize this day's conversation between me and {}.
IMPORTANT: Clearly distinguish between what "I" or "Me" did versus what {} did.
Always explicitly attribute actions, plans, and activities to the correct person.
Use "I" or "Me" for my actions and "{}" for their actions.
CRITICAL FORMAT RULES:
- Do NOT start with "Based on the conversation..." or "Here is a summary..." or similar preambles
- Do NOT repeat the date at the beginning
- Start DIRECTLY with the content - begin with a person's name or action
- Write in past tense, as if recording what happened
NARRATIVE (3-5 sentences):
- What specific topics, activities, or events were discussed?
- What places, people, or organizations were mentioned?
- What plans were made or decisions discussed?
- Clearly distinguish between what "I" did versus what {} did
KEYWORDS (comma-separated):
5-10 specific keywords that capture this conversation's unique content:
- Proper nouns (people, places, brands)
- Specific activities ("drum corps audition" not just "music")
- Distinctive terms that make this day unique
Date: {} ({})
Messages:
{}
Write a natural, informative summary with clear subject attribution.
Summary:"#,
YOUR RESPONSE (follow this format EXACTLY):
Summary: [Start directly with content, NO preamble]
Keywords: [specific, unique terms]"#,
contact,
contact,
date.format("%B %d, %Y"),
@@ -265,7 +346,7 @@ Summary:"#,
let summary = ollama
.generate(
&prompt,
Some("You are a conversation summarizer. Create clear, factual summaries that maintain precise subject attribution - clearly distinguishing who said or did what."),
Some("You are a conversation summarizer. Create clear, factual summaries with precise subject attribution AND extract distinctive keywords. Focus on specific, unique terms that differentiate this conversation from others."),
)
.await?;
@@ -277,8 +358,15 @@ Summary:"#,
span.set_attribute(KeyValue::new("summary_length", summary.len() as i64));
// Embed the summary
let embedding = ollama.generate_embedding(&summary).await?;
// Strip boilerplate before embedding to improve vector diversity
let stripped_summary = strip_summary_boilerplate(&summary);
log::debug!(
"Stripped summary for embedding: {}",
stripped_summary.chars().take(100).collect::<String>()
);
// Embed the stripped summary (store original summary in DB)
let embedding = ollama.generate_embedding(&stripped_summary).await?;
span.set_attribute(KeyValue::new(
"embedding_dimensions",
@@ -293,7 +381,8 @@ Summary:"#,
message_count: messages.len() as i32,
embedding,
created_at: Utc::now().timestamp(),
model_version: "nomic-embed-text:v1.5".to_string(),
// model_version: "nomic-embed-text:v1.5".to_string(),
model_version: "mxbai-embed-large:335m".to_string(),
};
// Create context from current span for DB operation