Pass image as additional Insight context
Cargo.lock (generated)
@@ -1808,6 +1808,7 @@ dependencies = [
  "actix-web",
  "actix-web-prom",
  "anyhow",
+ "base64",
  "bcrypt",
  "chrono",
  "clap",
Cargo.toml

@@ -54,3 +54,4 @@ urlencoding = "2.1"
 zerocopy = "0.8"
 ical = "0.11"
 scraper = "0.20"
+base64 = "0.22"
@@ -62,7 +62,10 @@ pub fn strip_summary_boilerplate(summary: &str) -> String {
         if text.to_lowercase().starts_with(&phrase.to_lowercase()) {
             text = text[phrase.len()..].trim_start().to_string();
             // Remove leading punctuation/articles after stripping phrase
-            text = text.trim_start_matches(|c| c == ',' || c == ':' || c == '-').trim_start().to_string();
+            text = text
+                .trim_start_matches(|c| c == ',' || c == ':' || c == '-')
+                .trim_start()
+                .to_string();
             break;
         }
     }
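Editor's sketch (not part of the commit) of what the trimming chain above does once a boilerplate phrase has been stripped. The phrase list inside strip_summary_boilerplate is not shown in this diff, so "Summary:" is an assumed example entry.

fn main() {
    let text = "Summary: - Cameron went hiking";
    let phrase = "Summary:"; // assumed phrase; the real list lives elsewhere
    let mut t = text[phrase.len()..].trim_start().to_string();
    // Same chain as the hunk above: drop leading punctuation, then whitespace.
    t = t
        .trim_start_matches(|c| c == ',' || c == ':' || c == '-')
        .trim_start()
        .to_string();
    assert_eq!(t, "Cameron went hiking");
}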
@@ -14,6 +14,10 @@ pub struct GeneratePhotoInsightRequest {
     pub file_path: String,
     #[serde(default)]
     pub model: Option<String>,
+    #[serde(default)]
+    pub system_prompt: Option<String>,
+    #[serde(default)]
+    pub num_ctx: Option<i32>,
 }

 #[derive(Debug, Deserialize)]
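Editor's sketch (not part of the commit) of how the extended request deserializes. The struct is copied from the hunk above; the JSON values and the serde_json dependency are illustrative.

use serde::Deserialize;

#[derive(Debug, Deserialize)]
pub struct GeneratePhotoInsightRequest {
    pub file_path: String,
    #[serde(default)]
    pub model: Option<String>,
    #[serde(default)]
    pub system_prompt: Option<String>,
    #[serde(default)]
    pub num_ctx: Option<i32>,
}

fn main() {
    // The new fields default to None, so older clients that send only
    // file_path (and optionally model) keep working unchanged.
    let body = r#"{
        "file_path": "2024/06/hike.jpg",
        "system_prompt": "Answer as a terse archivist.",
        "num_ctx": 8192
    }"#;
    let req: GeneratePhotoInsightRequest = serde_json::from_str(body).unwrap();
    assert_eq!(req.model, None);
    assert_eq!(req.num_ctx, Some(8192));
}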
@@ -63,16 +67,30 @@ pub async fn generate_insight_handler(
     if let Some(ref model) = request.model {
         span.set_attribute(KeyValue::new("model", model.clone()));
     }
+    if let Some(ref prompt) = request.system_prompt {
+        span.set_attribute(KeyValue::new("has_custom_prompt", true));
+        span.set_attribute(KeyValue::new("prompt_length", prompt.len() as i64));
+    }
+    if let Some(ctx) = request.num_ctx {
+        span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
+    }

     log::info!(
-        "Manual insight generation triggered for photo: {} with model: {:?}",
+        "Manual insight generation triggered for photo: {} with model: {:?}, custom_prompt: {}, num_ctx: {:?}",
         normalized_path,
-        request.model
+        request.model,
+        request.system_prompt.is_some(),
+        request.num_ctx
     );

-    // Generate insight with optional custom model
+    // Generate insight with optional custom model, system prompt, and context size
     let result = insight_generator
-        .generate_insight_for_photo_with_model(&normalized_path, request.model.clone())
+        .generate_insight_for_photo_with_config(
+            &normalized_path,
+            request.model.clone(),
+            request.system_prompt.clone(),
+            request.num_ctx,
+        )
         .await;

     match result {
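Editor's sketch (not part of the commit) of triggering the handler with the new knobs. The route path, port, and model name are assumptions; the JSON field names come from GeneratePhotoInsightRequest above.

use serde_json::json;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let resp = reqwest::Client::new()
        .post("http://localhost:8080/api/insights/generate") // hypothetical route
        .json(&json!({
            "file_path": "2024/06/hike.jpg",
            "model": "llava:13b", // illustrative vision model
            "system_prompt": "Answer as a terse archivist.",
            "num_ctx": 8192
        }))
        .send()
        .await?;
    println!("status: {}", resp.status());
    Ok(())
}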
@@ -1,9 +1,12 @@
 use anyhow::Result;
+use base64::Engine as _;
 use chrono::{DateTime, Utc};
+use image::ImageFormat;
 use opentelemetry::KeyValue;
 use opentelemetry::trace::{Span, Status, TraceContextExt, Tracer};
 use serde::Deserialize;
 use std::fs::File;
+use std::io::Cursor;
 use std::sync::{Arc, Mutex};

 use crate::ai::ollama::OllamaClient;
@@ -92,6 +95,51 @@ impl InsightGenerator {
         None
     }

+    /// Load image file, resize it, and encode as base64 for vision models
+    /// Resizes to max 1024px on longest edge to reduce context usage
+    fn load_image_as_base64(&self, file_path: &str) -> Result<String> {
+        use image::imageops::FilterType;
+        use std::path::Path;
+
+        let full_path = Path::new(&self.base_path).join(file_path);
+
+        log::debug!("Loading image for vision model: {:?}", full_path);
+
+        // Open and decode the image
+        let img = image::open(&full_path)
+            .map_err(|e| anyhow::anyhow!("Failed to open image file: {}", e))?;
+
+        let (original_width, original_height) = (img.width(), img.height());
+
+        // Resize to max 1024px on longest edge
+        let resized = img.resize(1024, 1024, FilterType::Lanczos3);
+
+        log::debug!(
+            "Resized image from {}x{} to {}x{}",
+            original_width,
+            original_height,
+            resized.width(),
+            resized.height()
+        );
+
+        // Encode as JPEG at 85% quality
+        let mut buffer = Vec::new();
+        let mut cursor = Cursor::new(&mut buffer);
+        resized
+            .write_to(&mut cursor, ImageFormat::Jpeg)
+            .map_err(|e| anyhow::anyhow!("Failed to encode image as JPEG: {}", e))?;
+
+        let base64_string = base64::engine::general_purpose::STANDARD.encode(&buffer);
+
+        log::debug!(
+            "Encoded image as base64 ({} bytes -> {} chars)",
+            buffer.len(),
+            base64_string.len()
+        );
+
+        Ok(base64_string)
+    }
+
     /// Find relevant messages using RAG, excluding recent messages (>30 days ago)
     /// This prevents RAG from returning messages already in the immediate time window
     async fn find_relevant_messages_rag_historical(
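Editor's note with a sketch (not part of the commit): image::DynamicImage::resize scales to fit within the given bounds while preserving aspect ratio, so resize(1024, 1024, ...) does cap the longest edge at 1024px as the doc comment says. One nit: write_to(..., ImageFormat::Jpeg) encodes at the JPEG encoder's default quality; the "85% quality" comment does not correspond to any setting on this call path.

use image::{imageops::FilterType, DynamicImage};

fn main() {
    // A blank 4:3 test image: fits within 1024x1024 while keeping its ratio.
    let img = DynamicImage::new_rgb8(4000, 3000);
    let resized = img.resize(1024, 1024, FilterType::Lanczos3);
    assert_eq!((resized.width(), resized.height()), (1024, 768));
}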
@@ -564,10 +612,23 @@ impl InsightGenerator {
     }

     /// Generate AI insight for a single photo with optional custom model
+    /// (Deprecated: Use generate_insight_for_photo_with_config instead)
     pub async fn generate_insight_for_photo_with_model(
         &self,
         file_path: &str,
         custom_model: Option<String>,
     ) -> Result<()> {
+        self.generate_insight_for_photo_with_config(file_path, custom_model, None, None)
+            .await
+    }
+
+    /// Generate AI insight for a single photo with custom configuration
+    pub async fn generate_insight_for_photo_with_config(
+        &self,
+        file_path: &str,
+        custom_model: Option<String>,
+        custom_system_prompt: Option<String>,
+        num_ctx: Option<i32>,
+    ) -> Result<()> {
         let tracer = global_tracer();
         let current_cx = opentelemetry::Context::current();
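Editor's sketch (not part of the commit): the old entry point now forwards to the new one with None for both added knobs, so existing callers are unaffected. `generator` stands in for an InsightGenerator built elsewhere.

async fn example(generator: &InsightGenerator) -> anyhow::Result<()> {
    // These two calls are equivalent after this change:
    generator
        .generate_insight_for_photo_with_model("2024/06/hike.jpg", None)
        .await?;
    generator
        .generate_insight_for_photo_with_config("2024/06/hike.jpg", None, None, None)
        .await?;
    Ok(())
}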
@@ -580,7 +641,7 @@ impl InsightGenerator {
         span.set_attribute(KeyValue::new("file_path", file_path.clone()));

         // Create custom Ollama client if model is specified
-        let ollama_client = if let Some(model) = custom_model {
+        let mut ollama_client = if let Some(model) = custom_model {
             log::info!("Using custom model: {}", model);
             span.set_attribute(KeyValue::new("custom_model", model.clone()));
             OllamaClient::new(
@@ -594,6 +655,13 @@ impl InsightGenerator {
             self.ollama.clone()
         };

+        // Set context size if specified
+        if let Some(ctx) = num_ctx {
+            log::info!("Using custom context size: {}", ctx);
+            span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
+            ollama_client.set_num_ctx(Some(ctx));
+        }
+
         // Create context with this span for child operations
         let insight_cx = current_cx.with_span(span);
@@ -740,12 +808,20 @@ impl InsightGenerator {

         // Step 4: Summarize contexts separately, then combine
         let immediate_summary = self
-            .summarize_context_from_messages(&immediate_messages, &ollama_client)
+            .summarize_context_from_messages(
+                &immediate_messages,
+                &ollama_client,
+                custom_system_prompt.as_deref(),
+            )
             .await
             .unwrap_or_else(|| String::from("No immediate context"));

         let historical_summary = self
-            .summarize_messages(&historical_messages, &ollama_client)
+            .summarize_messages(
+                &historical_messages,
+                &ollama_client,
+                custom_system_prompt.as_deref(),
+            )
             .await
             .unwrap_or_else(|| String::from("No historical context"));
@@ -759,13 +835,21 @@ impl InsightGenerator {
                 // RAG found no historical matches, just use immediate context
                 log::info!("No historical RAG matches, using immediate context only");
                 sms_summary = self
-                    .summarize_context_from_messages(&immediate_messages, &ollama_client)
+                    .summarize_context_from_messages(
+                        &immediate_messages,
+                        &ollama_client,
+                        custom_system_prompt.as_deref(),
+                    )
                     .await;
             }
             Err(e) => {
                 log::warn!("Historical RAG failed, using immediate context only: {}", e);
                 sms_summary = self
-                    .summarize_context_from_messages(&immediate_messages, &ollama_client)
+                    .summarize_context_from_messages(
+                        &immediate_messages,
+                        &ollama_client,
+                        custom_system_prompt.as_deref(),
+                    )
                     .await;
             }
         }
@@ -778,7 +862,13 @@ impl InsightGenerator {
         {
             Ok(rag_messages) if !rag_messages.is_empty() => {
                 used_rag = true;
-                sms_summary = self.summarize_messages(&rag_messages, &ollama_client).await;
+                sms_summary = self
+                    .summarize_messages(
+                        &rag_messages,
+                        &ollama_client,
+                        custom_system_prompt.as_deref(),
+                    )
+                    .await;
             }
             _ => {}
         }
@@ -882,13 +972,37 @@ impl InsightGenerator {
             combined_context.len()
         );

-        // 8. Generate title and summary with Ollama (using multi-source context)
+        // 8. Load image and encode as base64 for vision models
+        let image_base64 = match self.load_image_as_base64(&file_path) {
+            Ok(b64) => {
+                log::info!("Successfully loaded image for vision model");
+                Some(b64)
+            }
+            Err(e) => {
+                log::warn!("Failed to load image for vision model: {}", e);
+                None
+            }
+        };
+
+        // 9. Generate title and summary with Ollama (using multi-source context + image)
         let title = ollama_client
-            .generate_photo_title(date_taken, location.as_deref(), Some(&combined_context))
+            .generate_photo_title(
+                date_taken,
+                location.as_deref(),
+                Some(&combined_context),
+                custom_system_prompt.as_deref(),
+                image_base64.clone(),
+            )
             .await?;

         let summary = ollama_client
-            .generate_photo_summary(date_taken, location.as_deref(), Some(&combined_context))
+            .generate_photo_summary(
+                date_taken,
+                location.as_deref(),
+                Some(&combined_context),
+                custom_system_prompt.as_deref(),
+                image_base64,
+            )
             .await?;

         log::info!("Generated title: {}", title);
@@ -1037,6 +1151,7 @@ Return ONLY the comma-separated list, nothing else."#,
         &self,
         messages: &[String],
         ollama: &OllamaClient,
+        custom_system: Option<&str>,
     ) -> Option<String> {
         if messages.is_empty() {
             return None;
@@ -1054,13 +1169,10 @@ Return ONLY the summary, nothing else."#,
             messages_text
         );

-        match ollama
-            .generate(
-                &prompt,
-                Some("You are a context summarization assistant. Be concise and factual."),
-            )
-            .await
-        {
+        let system = custom_system
+            .unwrap_or("You are a context summarization assistant. Be concise and factual.");
+
+        match ollama.generate(&prompt, Some(system)).await {
             Ok(summary) => Some(summary),
             Err(e) => {
                 log::warn!("Failed to summarize messages: {}", e);
@@ -1075,6 +1187,7 @@ Return ONLY the summary, nothing else."#,
         &self,
         messages: &[crate::ai::SmsMessage],
         ollama: &OllamaClient,
+        custom_system: Option<&str>,
     ) -> Option<String> {
         if messages.is_empty() {
             return None;
@@ -1111,13 +1224,11 @@ Return ONLY the summary, nothing else."#,
             messages_text
         );

-        match ollama
-            .generate(
-                &prompt,
-                Some("You are a context summarization assistant. Be detailed and factual, preserving important context."),
-            )
-            .await
-        {
+        let system = custom_system.unwrap_or(
+            "You are a context summarization assistant. Be detailed and factual, preserving important context.",
+        );
+
+        match ollama.generate(&prompt, Some(system)).await {
             Ok(summary) => Some(summary),
             Err(e) => {
                 log::warn!("Failed to summarize immediate context: {}", e);
src/ai/ollama.rs
@@ -11,6 +11,7 @@ pub struct OllamaClient {
     pub fallback_url: Option<String>,
     pub primary_model: String,
     pub fallback_model: Option<String>,
+    num_ctx: Option<i32>,
 }

 impl OllamaClient {
@@ -30,9 +31,14 @@ impl OllamaClient {
             fallback_url,
             primary_model,
             fallback_model,
+            num_ctx: None,
         }
     }

+    pub fn set_num_ctx(&mut self, num_ctx: Option<i32>) {
+        self.num_ctx = num_ctx;
+    }
+
     /// List available models on an Ollama server
     pub async fn list_models(url: &str) -> Result<Vec<String>> {
         let client = Client::builder()
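Editor's sketch (not part of the commit) of how the new setter is meant to be used. `client` stands in for an OllamaClient built elsewhere; the constructor arguments are not shown in this diff, so none are assumed.

async fn example(mut client: OllamaClient) -> anyhow::Result<()> {
    // Applies to every subsequent request from this client, because
    // try_generate maps self.num_ctx into the request's options field.
    client.set_num_ctx(Some(8192));
    let text = client
        .generate("Describe this moment.", Some("Be concise."))
        .await?;
    println!("{text}");
    Ok(())
}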
@@ -79,12 +85,15 @@ impl OllamaClient {
         model: &str,
         prompt: &str,
         system: Option<&str>,
+        images: Option<Vec<String>>,
     ) -> Result<String> {
         let request = OllamaRequest {
             model: model.to_string(),
             prompt: prompt.to_string(),
             stream: false,
             system: system.map(|s| s.to_string()),
+            options: self.num_ctx.map(|ctx| OllamaOptions { num_ctx: ctx }),
+            images,
         };

         let response = self
@@ -109,12 +118,24 @@ impl OllamaClient {
     }

     pub async fn generate(&self, prompt: &str, system: Option<&str>) -> Result<String> {
+        self.generate_with_images(prompt, system, None).await
+    }
+
+    pub async fn generate_with_images(
+        &self,
+        prompt: &str,
+        system: Option<&str>,
+        images: Option<Vec<String>>,
+    ) -> Result<String> {
         log::debug!("=== Ollama Request ===");
         log::debug!("Primary model: {}", self.primary_model);
         if let Some(sys) = system {
             log::debug!("System: {}", sys);
         }
         log::debug!("Prompt:\n{}", prompt);
+        if let Some(ref imgs) = images {
+            log::debug!("Images: {} image(s) included", imgs.len());
+        }
         log::debug!("=====================");

         // Try primary server first with primary model
@@ -124,7 +145,13 @@ impl OllamaClient {
             self.primary_model
         );
         let primary_result = self
-            .try_generate(&self.primary_url, &self.primary_model, prompt, system)
+            .try_generate(
+                &self.primary_url,
+                &self.primary_model,
+                prompt,
+                system,
+                images.clone(),
+            )
             .await;

         let raw_response = match primary_result {
@@ -147,7 +174,7 @@ impl OllamaClient {
                 fallback_model
             );
             match self
-                .try_generate(fallback_url, fallback_model, prompt, system)
+                .try_generate(fallback_url, fallback_model, prompt, system, images.clone())
                 .await
             {
                 Ok(response) => {
@@ -190,12 +217,30 @@ impl OllamaClient {
         date: NaiveDate,
         location: Option<&str>,
         sms_summary: Option<&str>,
+        custom_system: Option<&str>,
+        image_base64: Option<String>,
     ) -> Result<String> {
         let location_str = location.unwrap_or("Unknown location");
         let sms_str = sms_summary.unwrap_or("No messages");

-        let prompt = format!(
-            r#"Create a short title (maximum 8 words) about this moment:
+        let prompt = if image_base64.is_some() {
+            format!(
+                r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
+
+Date: {}
+Location: {}
+Messages: {}
+
+Analyze the image and use specific details from both the visual content and the context above. If limited information is available, use a simple descriptive title based on what you see.
+
+Return ONLY the title, nothing else."#,
+                date.format("%B %d, %Y"),
+                location_str,
+                sms_str
+            )
+        } else {
+            format!(
+                r#"Create a short title (maximum 8 words) about this moment:

 Date: {}
 Location: {}
@@ -204,14 +249,18 @@ Messages: {}

 Use specific details from the context above. If no specific details are available, use a simple descriptive title.

 Return ONLY the title, nothing else."#,
-            date.format("%B %d, %Y"),
-            location_str,
-            sms_str
-        );
+                date.format("%B %d, %Y"),
+                location_str,
+                sms_str
+            )
+        };

-        let system = "You are my long term memory assistant. Use only the information provided. Do not invent details.";
+        let system = custom_system.unwrap_or("You are my long term memory assistant. Use only the information provided. Do not invent details.");

-        let title = self.generate(&prompt, Some(system)).await?;
+        let images = image_base64.map(|img| vec![img]);
+        let title = self
+            .generate_with_images(&prompt, Some(system), images)
+            .await?;
         Ok(title.trim().trim_matches('"').to_string())
     }
@@ -221,26 +270,45 @@ Return ONLY the title, nothing else."#,
         date: NaiveDate,
         location: Option<&str>,
         sms_summary: Option<&str>,
+        custom_system: Option<&str>,
+        image_base64: Option<String>,
     ) -> Result<String> {
         let location_str = location.unwrap_or("Unknown");
         let sms_str = sms_summary.unwrap_or("No messages");

-        let prompt = format!(
-            r#"Write a 1-3 paragraph description of this moment based on the available information:
+        let prompt = if image_base64.is_some() {
+            format!(
+                r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
+
+Date: {}
+Location: {}
+Messages: {}
+
+Analyze the image and use specific details from both the visual content and the context above. Mention people's names, places, or activities if they appear in either the image or the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#,
+                date.format("%B %d, %Y"),
+                location_str,
+                sms_str
+            )
+        } else {
+            format!(
+                r#"Write a 1-3 paragraph description of this moment based on the available information:

 Date: {}
 Location: {}
 Messages: {}

 Use only the specific details provided above. Mention people's names, places, or activities if they appear in the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual. If the location is unknown omit it"#,
-            date.format("%B %d, %Y"),
-            location_str,
-            sms_str
-        );
+                date.format("%B %d, %Y"),
+                location_str,
+                sms_str
+            )
+        };

-        let system = "You are a memory refreshing assistant who is able to provide insights through analyzing past conversations. Use only the information provided. Do not invent details.";
+        let system = custom_system.unwrap_or("You are a memory refreshing assistant who is able to provide insights through analyzing past conversations. Use only the information provided. Do not invent details.");

-        self.generate(&prompt, Some(system)).await
+        let images = image_base64.map(|img| vec![img]);
+        self.generate_with_images(&prompt, Some(system), images)
+            .await
     }

     /// Generate an embedding vector for text using nomic-embed-text:v1.5
@@ -388,6 +456,15 @@ struct OllamaRequest {
     stream: bool,
     #[serde(skip_serializing_if = "Option::is_none")]
     system: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    options: Option<OllamaOptions>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    images: Option<Vec<String>>,
 }
+
+#[derive(Serialize)]
+struct OllamaOptions {
+    num_ctx: i32,
+}

 #[derive(Deserialize)]
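Editor's sketch (not part of the commit) of what the serialized payload looks like with the new fields. The structs are copied from the hunk above; the model name and image bytes are placeholders. skip_serializing_if keeps options/images out of the JSON entirely when they are None, so text-only requests are unchanged on the wire.

use serde::Serialize;

#[derive(Serialize)]
struct OllamaOptions {
    num_ctx: i32,
}

#[derive(Serialize)]
struct OllamaRequest {
    model: String,
    prompt: String,
    stream: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    system: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    options: Option<OllamaOptions>,
    #[serde(skip_serializing_if = "Option::is_none")]
    images: Option<Vec<String>>,
}

fn main() {
    let req = OllamaRequest {
        model: "llava:13b".into(), // illustrative
        prompt: "Create a short title about this moment.".into(),
        stream: false,
        system: None,
        options: Some(OllamaOptions { num_ctx: 8192 }),
        images: Some(vec!["<base64-encoded JPEG>".into()]),
    };
    // Matches Ollama's /api/generate schema, where images is a list of
    // base64-encoded images and options.num_ctx sets the context window.
    println!("{}", serde_json::to_string_pretty(&req).unwrap());
}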
@@ -104,7 +104,10 @@ fn main() -> Result<()> {
                 ));
             }
             Err(e) => {
-                println!("Warning: Failed to parse embedding for id {}: {}", row.id, e);
+                println!(
+                    "Warning: Failed to parse embedding for id {}: {}",
+                    row.id, e
+                );
             }
         }
     }
@@ -205,11 +208,31 @@ fn main() -> Result<()> {
     let count_below_03 = all_similarities.iter().filter(|&&s| s < 0.3).count();

     println!("Similarity distribution:");
-    println!("  > 0.8: {} ({:.1}%)", count_above_08, 100.0 * count_above_08 as f32 / all_similarities.len() as f32);
-    println!("  > 0.7: {} ({:.1}%)", count_above_07, 100.0 * count_above_07 as f32 / all_similarities.len() as f32);
-    println!("  > 0.6: {} ({:.1}%)", count_above_06, 100.0 * count_above_06 as f32 / all_similarities.len() as f32);
-    println!("  > 0.5: {} ({:.1}%)", count_above_05, 100.0 * count_above_05 as f32 / all_similarities.len() as f32);
-    println!("  < 0.3: {} ({:.1}%)", count_below_03, 100.0 * count_below_03 as f32 / all_similarities.len() as f32);
+    println!(
+        "  > 0.8: {} ({:.1}%)",
+        count_above_08,
+        100.0 * count_above_08 as f32 / all_similarities.len() as f32
+    );
+    println!(
+        "  > 0.7: {} ({:.1}%)",
+        count_above_07,
+        100.0 * count_above_07 as f32 / all_similarities.len() as f32
+    );
+    println!(
+        "  > 0.6: {} ({:.1}%)",
+        count_above_06,
+        100.0 * count_above_06 as f32 / all_similarities.len() as f32
+    );
+    println!(
+        "  > 0.5: {} ({:.1}%)",
+        count_above_05,
+        100.0 * count_above_05 as f32 / all_similarities.len() as f32
+    );
+    println!(
+        "  < 0.3: {} ({:.1}%)",
+        count_below_03,
+        100.0 * count_below_03 as f32 / all_similarities.len() as f32
+    );
     println!();

     // Identify "central" embeddings (high average similarity to all others)
@@ -255,7 +278,9 @@ fn main() -> Result<()> {
     println!("  This explains why the same summaries always match.");
     println!();
     println!("  Possible causes:");
-    println!("  1. Summaries have similar structure/phrasing (e.g., all start with 'Summary:')");
+    println!(
+        "  1. Summaries have similar structure/phrasing (e.g., all start with 'Summary:')"
+    );
     println!("  2. Embedding model isn't capturing semantic differences well");
     println!("  3. Daily conversations have similar topics (e.g., 'good morning', plans)");
     println!();
@@ -239,7 +239,10 @@ Keywords: [specific, unique terms]"#,
     if !args.test_mode {
         println!("\nStripping boilerplate for embedding...");
         let stripped = strip_summary_boilerplate(&summary);
-        println!("Stripped: {}...", stripped.chars().take(80).collect::<String>());
+        println!(
+            "Stripped: {}...",
+            stripped.chars().take(80).collect::<String>()
+        );

         println!("\nGenerating embedding...");
         let embedding = ollama.generate_embedding(&stripped).await?;
@@ -66,7 +66,7 @@ impl PathExcluder {
         // Directory-based exclusions
         for excluded in &self.excluded_dirs {
             if path.starts_with(excluded) {
-                debug!(
+                trace!(
                     "PathExcluder: excluded by dir: {:?} (rule: {:?})",
                     path, excluded
                 );
@@ -81,7 +81,7 @@ impl PathExcluder {
             if let Some(comp_str) = component.as_os_str().to_str()
                 && self.excluded_patterns.iter().any(|pat| pat == comp_str)
             {
-                debug!(
+                trace!(
                     "PathExcluder: excluded by component pattern: {:?} (component: {:?}, patterns: {:?})",
                     path, comp_str, self.excluded_patterns
                 );