Add check for vision capabilities

This commit is contained in:
Cameron
2026-01-11 15:22:24 -05:00
parent 5b35df4007
commit ad0bba63b4
4 changed files with 235 additions and 42 deletions

View File

@@ -3,7 +3,7 @@ use opentelemetry::KeyValue;
use opentelemetry::trace::{Span, Status, Tracer};
use serde::{Deserialize, Serialize};
use crate::ai::{InsightGenerator, OllamaClient};
use crate::ai::{InsightGenerator, ModelCapabilities, OllamaClient};
use crate::data::Claims;
use crate::database::InsightDao;
use crate::otel::{extract_context_from_request, global_tracer};
@@ -45,7 +45,7 @@ pub struct AvailableModelsResponse {
#[derive(Debug, Serialize)]
pub struct ServerModels {
pub url: String,
pub models: Vec<String>,
pub models: Vec<ModelCapabilities>,
pub default_model: String,
}
@@ -211,18 +211,18 @@ pub async fn get_all_insights_handler(
}
}
/// GET /insights/models - List available models from both servers
/// GET /insights/models - List available models from both servers with capabilities
#[get("/insights/models")]
pub async fn get_available_models_handler(
_claims: Claims,
app_state: web::Data<crate::state::AppState>,
) -> impl Responder {
log::debug!("Fetching available models");
log::debug!("Fetching available models with capabilities");
let ollama_client = &app_state.ollama;
// Fetch models from primary server
let primary_models = match OllamaClient::list_models(&ollama_client.primary_url).await {
// Fetch models with capabilities from primary server
let primary_models = match OllamaClient::list_models_with_capabilities(&ollama_client.primary_url).await {
Ok(models) => models,
Err(e) => {
log::warn!("Failed to fetch models from primary server: {:?}", e);
@@ -236,9 +236,9 @@ pub async fn get_available_models_handler(
default_model: ollama_client.primary_model.clone(),
};
// Fetch models from fallback server if configured
// Fetch models with capabilities from fallback server if configured
let fallback = if let Some(fallback_url) = &ollama_client.fallback_url {
match OllamaClient::list_models(fallback_url).await {
match OllamaClient::list_models_with_capabilities(fallback_url).await {
Ok(models) => Some(ServerModels {
url: fallback_url.clone(),
models,

View File

@@ -961,23 +961,62 @@ impl InsightGenerator {
combined_context.len()
);
// 8. Load image and encode as base64 for vision models
let image_base64 = match self.load_image_as_base64(&file_path) {
Ok(b64) => {
log::info!("Successfully loaded image for vision model");
Some(b64)
// 8. Check if the model has vision capabilities
let model_to_check = ollama_client.primary_model.clone();
let has_vision = match OllamaClient::check_model_capabilities(
&ollama_client.primary_url,
&model_to_check,
)
.await
{
Ok(capabilities) => {
log::info!(
"Model '{}' vision capability: {}",
model_to_check,
capabilities.has_vision
);
capabilities.has_vision
}
Err(e) => {
log::warn!("Failed to load image for vision model: {}", e);
None
log::warn!(
"Failed to check vision capabilities for model '{}', assuming no vision support: {}",
model_to_check,
e
);
false
}
};
// 9. Generate title and summary with Ollama (using multi-source context + image)
insight_cx
.span()
.set_attribute(KeyValue::new("model_has_vision", has_vision));
// 9. Load image and encode as base64 only if model supports vision
let image_base64 = if has_vision {
match self.load_image_as_base64(&file_path) {
Ok(b64) => {
log::info!("Successfully loaded image for vision-capable model '{}'", model_to_check);
Some(b64)
}
Err(e) => {
log::warn!("Failed to load image for vision model: {}", e);
None
}
}
} else {
log::info!(
"Model '{}' does not support vision, skipping image processing",
model_to_check
);
None
};
// 10. Generate title and summary with Ollama (using multi-source context + image if supported)
let title = ollama_client
.generate_photo_title(
date_taken,
location.as_deref(),
contact.as_deref(),
Some(&combined_context),
custom_system_prompt.as_deref(),
image_base64.clone(),
@@ -988,6 +1027,7 @@ impl InsightGenerator {
.generate_photo_summary(
date_taken,
location.as_deref(),
contact.as_deref(),
Some(&combined_context),
custom_system_prompt.as_deref(),
image_base64,
@@ -1004,7 +1044,7 @@ impl InsightGenerator {
.span()
.set_attribute(KeyValue::new("summary_length", summary.len() as i64));
// 9. Store in database
// 11. Store in database
let insight = InsertPhotoInsight {
file_path: file_path.to_string(),
title,

View File

@@ -11,5 +11,5 @@ pub use handlers::{
get_available_models_handler, get_insight_handler,
};
pub use insight_generator::InsightGenerator;
pub use ollama::OllamaClient;
pub use ollama::{ModelCapabilities, OllamaClient};
pub use sms_client::{SmsApiClient, SmsMessage};

View File

@@ -62,6 +62,67 @@ impl OllamaClient {
Ok(models.iter().any(|m| m == model_name))
}
/// Check whether a model has vision capabilities by querying the Ollama
/// `/api/show` endpoint for the given model.
///
/// # Errors
/// Returns an error if the HTTP client cannot be built, the request fails
/// or times out, the server responds with a non-success status, or the
/// response body cannot be deserialized.
pub async fn check_model_capabilities(url: &str, model_name: &str) -> Result<ModelCapabilities> {
    // Short timeouts: this is called once per model (see
    // list_models_with_capabilities), so an unreachable server should
    // fail fast rather than stall the whole listing.
    let client = Client::builder()
        .connect_timeout(Duration::from_secs(5))
        .timeout(Duration::from_secs(10))
        .build()?;

    // Request body for POST /api/show: { "model": "<name>" }.
    #[derive(Serialize)]
    struct ShowRequest {
        model: String,
    }

    let response = client
        .post(format!("{}/api/show", url))
        .json(&ShowRequest {
            model: model_name.to_string(),
        })
        .send()
        .await?;

    // Include the HTTP status so failures are diagnosable from logs alone.
    let status = response.status();
    if !status.is_success() {
        return Err(anyhow::anyhow!(
            "Failed to get model details for {} from {} (HTTP {})",
            model_name,
            url,
            status
        ));
    }

    let show_response: OllamaShowResponse = response.json().await?;

    // Ollama reports model features in a "capabilities" array; vision-capable
    // models include the "vision" entry.
    let has_vision = show_response.capabilities.iter().any(|cap| cap == "vision");

    Ok(ModelCapabilities {
        name: model_name.to_string(),
        has_vision,
    })
}
/// List every model available on the given server together with its
/// capability flags. Models whose capability lookup fails are still
/// included, conservatively marked as having no vision support.
pub async fn list_models_with_capabilities(url: &str) -> Result<Vec<ModelCapabilities>> {
    let model_names = Self::list_models(url).await?;
    let mut results = Vec::with_capacity(model_names.len());

    // Capability checks run sequentially: each is a small metadata call
    // and the model list on a single server is typically short.
    for name in model_names {
        let entry = match Self::check_model_capabilities(url, &name).await {
            Ok(cap) => cap,
            Err(e) => {
                log::warn!("Failed to get capabilities for model {}: {}", name, e);
                // Fallback: assume no vision if we can't check
                ModelCapabilities {
                    name,
                    has_vision: false,
                }
            }
        };
        results.push(entry);
    }

    Ok(results)
}
/// Extract final answer from thinking model output
/// Handles <think>...</think> tags and takes everything after
fn extract_final_answer(&self, response: &str) -> String {
@@ -216,6 +277,7 @@ impl OllamaClient {
&self,
date: NaiveDate,
location: Option<&str>,
contact: Option<&str>,
sms_summary: Option<&str>,
custom_system: Option<&str>,
image_base64: Option<String>,
@@ -224,8 +286,27 @@ impl OllamaClient {
let sms_str = sms_summary.unwrap_or("No messages");
let prompt = if image_base64.is_some() {
format!(
r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
if let Some(contact_name) = contact {
format!(
r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
Date: {}
Location: {}
Person/Contact: {}
Messages: {}
Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. If limited information is available, use a simple descriptive title based on what you see.
Return ONLY the title, nothing else."#,
date.format("%B %d, %Y"),
location_str,
contact_name,
sms_str,
contact_name
)
} else {
format!(
r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
Date: {}
Location: {}
@@ -234,13 +315,33 @@ Messages: {}
Analyze the image and use specific details from both the visual content and the context above. If limited information is available, use a simple descriptive title based on what you see.
Return ONLY the title, nothing else."#,
date.format("%B %d, %Y"),
location_str,
sms_str
)
date.format("%B %d, %Y"),
location_str,
sms_str
)
}
} else {
format!(
r#"Create a short title (maximum 8 words) about this moment:
if let Some(contact_name) = contact {
format!(
r#"Create a short title (maximum 8 words) about this moment:
Date: {}
Location: {}
Person/Contact: {}
Messages: {}
Use specific details from the context above. The photo is from a folder for {}, so they are likely related to this moment. If no specific details are available, use a simple descriptive title.
Return ONLY the title, nothing else."#,
date.format("%B %d, %Y"),
location_str,
contact_name,
sms_str,
contact_name
)
} else {
format!(
r#"Create a short title (maximum 8 words) about this moment:
Date: {}
Location: {}
@@ -249,10 +350,11 @@ Messages: {}
Use specific details from the context above. If no specific details are available, use a simple descriptive title.
Return ONLY the title, nothing else."#,
date.format("%B %d, %Y"),
location_str,
sms_str
)
date.format("%B %d, %Y"),
location_str,
sms_str
)
}
};
let system = custom_system.unwrap_or("You are my long term memory assistant. Use only the information provided. Do not invent details.");
@@ -269,6 +371,7 @@ Return ONLY the title, nothing else."#,
&self,
date: NaiveDate,
location: Option<&str>,
contact: Option<&str>,
sms_summary: Option<&str>,
custom_system: Option<&str>,
image_base64: Option<String>,
@@ -277,31 +380,69 @@ Return ONLY the title, nothing else."#,
let sms_str = sms_summary.unwrap_or("No messages");
let prompt = if image_base64.is_some() {
format!(
r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
if let Some(contact_name) = contact {
format!(
r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
Date: {}
Location: {}
Person/Contact: {}
Messages: {}
Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. Mention people's names (especially {}), places, or activities if they appear in either the image or the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#,
date.format("%B %d, %Y"),
location_str,
contact_name,
sms_str,
contact_name,
contact_name
)
} else {
format!(
r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
Date: {}
Location: {}
Messages: {}
Analyze the image and use specific details from both the visual content and the context above. Mention people's names, places, or activities if they appear in either the image or the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#,
date.format("%B %d, %Y"),
location_str,
sms_str
)
date.format("%B %d, %Y"),
location_str,
sms_str
)
}
} else {
format!(
r#"Write a 1-3 paragraph description of this moment based on the available information:
if let Some(contact_name) = contact {
format!(
r#"Write a 1-3 paragraph description of this moment based on the available information:
Date: {}
Location: {}
Person/Contact: {}
Messages: {}
Use only the specific details provided above. The photo is from a folder for {}, so they are likely related to this moment. Mention people's names (especially {}), places, or activities if they appear in the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual. If the location is unknown omit it"#,
date.format("%B %d, %Y"),
location_str,
contact_name,
sms_str,
contact_name,
contact_name
)
} else {
format!(
r#"Write a 1-3 paragraph description of this moment based on the available information:
Date: {}
Location: {}
Messages: {}
Use only the specific details provided above. Mention people's names, places, or activities if they appear in the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual. If the location is unknown omit it"#,
date.format("%B %d, %Y"),
location_str,
sms_str
)
date.format("%B %d, %Y"),
location_str,
sms_str
)
}
};
let system = custom_system.unwrap_or("You are a memory refreshing assistant who is able to provide insights through analyzing past conversations. Use only the information provided. Do not invent details.");
@@ -482,6 +623,18 @@ struct OllamaModel {
name: String,
}
/// Subset of the Ollama `/api/show` response we care about.
#[derive(Deserialize)]
struct OllamaShowResponse {
    // Feature tags such as "vision". Defaults to an empty Vec when the
    // field is absent from the response — presumably for older Ollama
    // servers that don't report capabilities; TODO confirm.
    #[serde(default)]
    capabilities: Vec<String>,
}
/// Capability summary for a single Ollama model, serialized to API
/// consumers (e.g. inside the `/insights/models` response).
// PartialEq/Eq added so callers and tests can compare capability entries
// directly; both fields (String, bool) support full equality.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct ModelCapabilities {
    /// Model name as reported by the Ollama server.
    pub name: String,
    /// True when the model advertises the "vision" capability.
    pub has_vision: bool,
}
#[derive(Serialize)]
struct OllamaEmbedRequest {
model: String,