Daily Summary Embedding Testing
This commit is contained in:
@@ -10,6 +10,75 @@ use crate::ai::{OllamaClient, SmsApiClient, SmsMessage};
|
||||
use crate::database::{DailySummaryDao, InsertDailySummary};
|
||||
use crate::otel::global_tracer;
|
||||
|
||||
/// Strip boilerplate prefixes and common phrases from summaries before embedding.
///
/// This improves embedding diversity by removing structural similarity that
/// generated summaries tend to share. The steps run in this order:
///
/// 1. Trim surrounding whitespace.
/// 2. Drop leading markdown header lines (lines starting with `#`). If the
///    whole text is a single header line, only the leading `#`s are removed.
/// 3. Remove one leading "Summary:" label (optionally wrapped in markdown
///    emphasis), matched ASCII case-insensitively.
/// 4. Remove one known opening phrase (e.g. "Today, I discussed"), matched
///    ASCII case-insensitively, along with any `,`, `:` or `-` left behind.
/// 5. If the text still starts with a `**bold**` span, keep its content but
///    drop the `**` markers.
///
/// Returns the cleaned text as a new `String`; the input is never modified.
pub fn strip_summary_boilerplate(summary: &str) -> String {
    // Matching is case-insensitive, so one spelling per label shape suffices.
    // Order matters: the first matching entry wins.
    const SUMMARY_PREFIXES: &[&str] = &["**Summary:**", "**Summary**:", "*Summary:*", "Summary:"];

    // Opening phrases that add no semantic value. Longer variants come before
    // the shorter phrases they contain so that the whole phrase is consumed.
    const OPENING_PHRASES: &[&str] = &[
        "Today, Melissa and I discussed",
        "Today, Amanda and I discussed",
        "Today Melissa and I discussed",
        "Today Amanda and I discussed",
        "Melissa and I discussed",
        "Amanda and I discussed",
        "Today, I discussed",
        "Today I discussed",
        "The conversation covered",
        "This conversation covered",
        "In this conversation,",
        "During this conversation,",
    ];

    // Work on borrowed slices throughout; allocate only for the final result.
    let mut text = summary.trim();

    // Remove markdown headers.
    while text.starts_with('#') {
        match text.find('\n') {
            // Drop the entire header line, then any blank space that follows.
            Some(newline) => text = text[newline..].trim_start(),
            None => {
                // Single line consisting of a header: keep the content after the #s.
                text = text.trim_start_matches('#').trim();
                break;
            }
        }
    }

    // Remove a "Summary:" label (with optional markdown emphasis).
    let after_label = SUMMARY_PREFIXES
        .iter()
        .find_map(|prefix| strip_prefix_ignore_ascii_case(text, prefix));
    if let Some(rest) = after_label {
        text = rest.trim_start();
    }

    // Remove a common opening phrase, plus punctuation left dangling after it.
    let after_phrase = OPENING_PHRASES
        .iter()
        .find_map(|phrase| strip_prefix_ignore_ascii_case(text, phrase));
    if let Some(rest) = after_phrase {
        text = rest
            .trim_start()
            .trim_start_matches(|c: char| matches!(c, ',' | ':' | '-'))
            .trim_start();
    }

    // Unwrap a leading **bold** span: keep the content, drop the markers.
    // An opening `**` with no closing `**` is left untouched.
    match text
        .strip_prefix("**")
        .and_then(|inner| inner.split_once("**"))
    {
        Some((bold, rest)) => format!("{}{}", bold, rest).trim().to_string(),
        None => text.trim().to_string(),
    }
}

/// Return the remainder of `text` after `prefix` if `text` starts with
/// `prefix` under ASCII case-insensitive comparison, otherwise `None`.
///
/// The comparison is done on the original bytes (no lowercased copy), and
/// `str::get` refuses ranges that are out of bounds or that split a UTF-8
/// character, so this never panics on non-ASCII input.
fn strip_prefix_ignore_ascii_case<'a>(text: &'a str, prefix: &str) -> Option<&'a str> {
    let head = text.get(..prefix.len())?;
    if head.eq_ignore_ascii_case(prefix) {
        text.get(prefix.len()..)
    } else {
        None
    }
}
|
||||
|
||||
/// Generate and embed daily conversation summaries for a date range
|
||||
/// Default: August 2024 ±30 days (July 1 - September 30, 2024)
|
||||
pub async fn generate_daily_summaries(
|
||||
@@ -238,22 +307,34 @@ async fn generate_and_store_daily_summary(
|
||||
let weekday = date.format("%A");
|
||||
|
||||
let prompt = format!(
|
||||
r#"Summarize this day's conversation in 3-5 sentences. Focus on:
|
||||
- Key topics, activities, and events discussed
|
||||
- Places, people, or organizations mentioned
|
||||
- Plans made or decisions discussed
|
||||
- Overall mood or themes of the day
|
||||
r#"Summarize this day's conversation between me and {}.
|
||||
|
||||
IMPORTANT: Clearly distinguish between what "I" or "Me" did versus what {} did.
|
||||
Always explicitly attribute actions, plans, and activities to the correct person.
|
||||
Use "I" or "Me" for my actions and "{}" for their actions.
|
||||
CRITICAL FORMAT RULES:
|
||||
- Do NOT start with "Based on the conversation..." or "Here is a summary..." or similar preambles
|
||||
- Do NOT repeat the date at the beginning
|
||||
- Start DIRECTLY with the content - begin with a person's name or action
|
||||
- Write in past tense, as if recording what happened
|
||||
|
||||
NARRATIVE (3-5 sentences):
|
||||
- What specific topics, activities, or events were discussed?
|
||||
- What places, people, or organizations were mentioned?
|
||||
- What plans were made or decisions discussed?
|
||||
- Clearly distinguish between what "I" did versus what {} did
|
||||
|
||||
KEYWORDS (comma-separated):
|
||||
5-10 specific keywords that capture this conversation's unique content:
|
||||
- Proper nouns (people, places, brands)
|
||||
- Specific activities ("drum corps audition" not just "music")
|
||||
- Distinctive terms that make this day unique
|
||||
|
||||
Date: {} ({})
|
||||
Messages:
|
||||
{}
|
||||
|
||||
Write a natural, informative summary with clear subject attribution.
|
||||
Summary:"#,
|
||||
YOUR RESPONSE (follow this format EXACTLY):
|
||||
Summary: [Start directly with content, NO preamble]
|
||||
|
||||
Keywords: [specific, unique terms]"#,
|
||||
contact,
|
||||
contact,
|
||||
date.format("%B %d, %Y"),
|
||||
@@ -265,7 +346,7 @@ Summary:"#,
|
||||
let summary = ollama
|
||||
.generate(
|
||||
&prompt,
|
||||
Some("You are a conversation summarizer. Create clear, factual summaries that maintain precise subject attribution - clearly distinguishing who said or did what."),
|
||||
Some("You are a conversation summarizer. Create clear, factual summaries with precise subject attribution AND extract distinctive keywords. Focus on specific, unique terms that differentiate this conversation from others."),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -277,8 +358,15 @@ Summary:"#,
|
||||
|
||||
span.set_attribute(KeyValue::new("summary_length", summary.len() as i64));
|
||||
|
||||
// Embed the summary
|
||||
let embedding = ollama.generate_embedding(&summary).await?;
|
||||
// Strip boilerplate before embedding to improve vector diversity
|
||||
let stripped_summary = strip_summary_boilerplate(&summary);
|
||||
log::debug!(
|
||||
"Stripped summary for embedding: {}",
|
||||
stripped_summary.chars().take(100).collect::<String>()
|
||||
);
|
||||
|
||||
// Embed the stripped summary (store original summary in DB)
|
||||
let embedding = ollama.generate_embedding(&stripped_summary).await?;
|
||||
|
||||
span.set_attribute(KeyValue::new(
|
||||
"embedding_dimensions",
|
||||
@@ -293,7 +381,8 @@ Summary:"#,
|
||||
message_count: messages.len() as i32,
|
||||
embedding,
|
||||
created_at: Utc::now().timestamp(),
|
||||
model_version: "nomic-embed-text:v1.5".to_string(),
|
||||
// model_version: "nomic-embed-text:v1.5".to_string(),
|
||||
model_version: "mxbai-embed-large:335m".to_string(),
|
||||
};
|
||||
|
||||
// Create context from current span for DB operation
|
||||
|
||||
Reference in New Issue
Block a user