feat: photo-first RAG enrichment — early vision description + tags in RAG and search context

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-18 17:23:49 -04:00
parent e58b8fe743
commit 8196ef94a0
1 changed file with 134 additions and 59 deletions
@@ -153,6 +153,7 @@ impl InsightGenerator {
        contact: Option<&str>,
        topics: Option<&[String]>,
        limit: usize,
+        extra_context: Option<&str>,
    ) -> Result<Vec<String>> {
        let tracer = global_tracer();
        let span = tracer.start_with_context("ai.rag.filter_historical", parent_cx);
@@ -174,7 +175,7 @@ impl InsightGenerator {
        }

        let query_results = self
-            .find_relevant_messages_rag(date, location, contact, topics, limit * 2)
+            .find_relevant_messages_rag(date, location, contact, topics, limit * 2, extra_context)
            .await?;

        filter_cx.span().set_attribute(KeyValue::new(
@@ -236,6 +237,7 @@ impl InsightGenerator {
        contact: Option<&str>,
        topics: Option<&[String]>,
        limit: usize,
+        extra_context: Option<&str>,
    ) -> Result<Vec<String>> {
        let tracer = global_tracer();
        let current_cx = opentelemetry::Context::current();
@@ -250,7 +252,7 @@ impl InsightGenerator {
        }

        // Build query string - prioritize topics if available (semantically meaningful)
-        let query = if let Some(topics) = topics {
+        let base_query = if let Some(topics) = topics {
            if !topics.is_empty() {
                // Use topics for semantic search - these are actual content keywords
                let topic_str = topics.join(", ");
@@ -268,6 +270,12 @@ impl InsightGenerator {
            Self::build_metadata_query(date, location, contact)
        };

+        let query = if let Some(extra) = extra_context {
+            format!("{}. {}", base_query, extra)
+        } else {
+            base_query
+        };
+
        span.set_attribute(KeyValue::new("query", query.clone()));

        // Create context with this span for child operations
@@ -718,6 +726,20 @@ impl InsightGenerator {
                .set_attribute(KeyValue::new("contact", c.clone()));
        }

+        // Fetch file tags (used to enrich RAG and final context)
+        let tag_names: Vec<String> = {
+            let mut dao = self.tag_dao.lock().expect("Unable to lock TagDao");
+            dao.get_tags_for_path(&insight_cx, &file_path)
+                .unwrap_or_else(|e| {
+                    log::warn!("Failed to fetch tags for insight {}: {}", file_path, e);
+                    Vec::new()
+                })
+                .into_iter()
+                .map(|t| t.name)
+                .collect()
+        };
+        log::info!("Fetched {} tags for photo: {:?}", tag_names.len(), tag_names);
+
        // 4. Get location name from GPS coordinates (needed for RAG query)
        let location = match exif {
            Some(ref exif) => {
@@ -744,6 +766,90 @@ impl InsightGenerator {
            None => None,
        };

+        // Check if the model has vision capabilities
+        let model_to_check = ollama_client.primary_model.clone();
+        let has_vision = match OllamaClient::check_model_capabilities(
+            &ollama_client.primary_url,
+            &model_to_check,
+        )
+        .await
+        {
+            Ok(capabilities) => {
+                log::info!(
+                    "Model '{}' vision capability: {}",
+                    model_to_check,
+                    capabilities.has_vision
+                );
+                capabilities.has_vision
+            }
+            Err(e) => {
+                log::warn!(
+                    "Failed to check vision capabilities for model '{}', assuming no vision support: {}",
+                    model_to_check,
+                    e
+                );
+                false
+            }
+        };
+
+        insight_cx
+            .span()
+            .set_attribute(KeyValue::new("model_has_vision", has_vision));
+
+        // Load image and encode as base64 only if model supports vision
+        let image_base64 = if has_vision {
+            match self.load_image_as_base64(&file_path) {
+                Ok(b64) => {
+                    log::info!(
+                        "Successfully loaded image for vision-capable model '{}'",
+                        model_to_check
+                    );
+                    Some(b64)
+                }
+                Err(e) => {
+                    log::warn!("Failed to load image for vision model: {}", e);
+                    None
+                }
+            }
+        } else {
+            log::info!(
+                "Model '{}' does not support vision, skipping image processing",
+                model_to_check
+            );
+            None
+        };
+
+        // Generate brief photo description for RAG enrichment (vision models only)
+        let photo_description: Option<String> = if let Some(ref img_b64) = image_base64 {
+            match ollama_client.generate_photo_description(img_b64).await {
+                Ok(desc) => {
+                    log::info!("Photo description for RAG enrichment: {}", desc);
+                    Some(desc)
+                }
+                Err(e) => {
+                    log::warn!("Failed to generate photo description for RAG enrichment: {}", e);
+                    None
+                }
+            }
+        } else {
+            None
+        };
+
+        // Build enriched context string for RAG: photo description + tags
+        // (SMS topics are passed separately to RAG functions)
+        let enriched_query: Option<String> = {
+            let mut parts: Vec<String> = Vec::new();
+            if let Some(ref desc) = photo_description {
+                parts.push(desc.clone());
+            }
+            if !tag_names.is_empty() {
+                parts.push(format!("tags: {}", tag_names.join(", ")));
+            }
+            if parts.is_empty() { None } else { Some(parts.join(". ")) }
+        };
+
+        let mut search_enrichment: Option<String> = enriched_query.clone();
+
        // 5. Intelligent retrieval: Hybrid approach for better context
        let mut sms_summary = None;
        let mut used_rag = false;
@@ -782,6 +888,21 @@ impl InsightGenerator {

                log::info!("Extracted topics for query enrichment: {:?}", topics);

+                // Build full search enrichment: SMS topics + photo description + tag names
+                search_enrichment = {
+                    let mut parts: Vec<String> = Vec::new();
+                    if !topics.is_empty() {
+                        parts.push(topics.join(", "));
+                    }
+                    if let Some(ref desc) = photo_description {
+                        parts.push(desc.clone());
+                    }
+                    if !tag_names.is_empty() {
+                        parts.push(format!("tags: {}", tag_names.join(", ")));
+                    }
+                    if parts.is_empty() { None } else { Some(parts.join(". ")) }
+                };
+
                // Step 3: Try historical RAG (>30 days ago) using extracted topics
                let topics_slice = if topics.is_empty() {
                    None
@@ -796,6 +917,7 @@ impl InsightGenerator {
                        contact.as_deref(),
                        topics_slice,
                        10, // Top 10 historical matches
+                        enriched_query.as_deref(),
                    )
                    .await
                {
@@ -858,7 +980,7 @@ impl InsightGenerator {
                log::info!("No immediate messages found, trying basic RAG as fallback");
                // Fallback to basic RAG even without strong query
                match self
-                    .find_relevant_messages_rag(date_taken, None, contact.as_deref(), None, 20)
+                    .find_relevant_messages_rag(date_taken, None, contact.as_deref(), None, 20, enriched_query.as_deref())
                    .await
                {
                    Ok(rag_messages) if !rag_messages.is_empty() => {
@@ -955,19 +1077,25 @@ impl InsightGenerator {
                timestamp,
                location.as_deref(),
                contact.as_deref(),
-                None, // enrichment — wired up in Task 5
+                search_enrichment.as_deref(),
            )
            .await
            .ok()
            .flatten();

        // 7. Combine all context sources with equal weight
+        let tags_context = if tag_names.is_empty() {
+            None
+        } else {
+            Some(tag_names.join(", "))
+        };
+
        let combined_context = Self::combine_contexts(
            sms_summary,
            calendar_context,
            location_context,
            search_context,
-            None, // tags — wired up in Task 5
+            tags_context,
        );

        log::info!(
@@ -975,59 +1103,6 @@ impl InsightGenerator {
            combined_context.len()
        );

-        // 8. Check if the model has vision capabilities
-        let model_to_check = ollama_client.primary_model.clone();
-        let has_vision = match OllamaClient::check_model_capabilities(
-            &ollama_client.primary_url,
-            &model_to_check,
-        )
-        .await
-        {
-            Ok(capabilities) => {
-                log::info!(
-                    "Model '{}' vision capability: {}",
-                    model_to_check,
-                    capabilities.has_vision
-                );
-                capabilities.has_vision
-            }
-            Err(e) => {
-                log::warn!(
-                    "Failed to check vision capabilities for model '{}', assuming no vision support: {}",
-                    model_to_check,
-                    e
-                );
-                false
-            }
-        };
-
-        insight_cx
-            .span()
-            .set_attribute(KeyValue::new("model_has_vision", has_vision));
-
-        // 9. Load image and encode as base64 only if model supports vision
-        let image_base64 = if has_vision {
-            match self.load_image_as_base64(&file_path) {
-                Ok(b64) => {
-                    log::info!(
-                        "Successfully loaded image for vision-capable model '{}'",
-                        model_to_check
-                    );
-                    Some(b64)
-                }
-                Err(e) => {
-                    log::warn!("Failed to load image for vision model: {}", e);
-                    None
-                }
-            }
-        } else {
-            log::info!(
-                "Model '{}' does not support vision, skipping image processing",
-                model_to_check
-            );
-            None
-        };
-
        // 10. Generate summary first, then derive title from the summary
        let summary = ollama_client
            .generate_photo_summary(
@@ -1036,7 +1111,7 @@ impl InsightGenerator {
                contact.as_deref(),
                Some(&combined_context),
                custom_system_prompt.as_deref(),
-                image_base64,
+                image_base64.clone(),
            )
            .await?;