Add check for vision capabilities

commit ad0bba63b4
parent 5b35df4007
Author: Cameron
Date:   2026-01-11 15:22:24 -05:00

4 changed files with 235 additions and 42 deletions

File 1 of 4

@@ -3,7 +3,7 @@ use opentelemetry::KeyValue;
 use opentelemetry::trace::{Span, Status, Tracer};
 use serde::{Deserialize, Serialize};
 
-use crate::ai::{InsightGenerator, OllamaClient};
+use crate::ai::{InsightGenerator, ModelCapabilities, OllamaClient};
 use crate::data::Claims;
 use crate::database::InsightDao;
 use crate::otel::{extract_context_from_request, global_tracer};
@@ -45,7 +45,7 @@ pub struct AvailableModelsResponse {
 
 #[derive(Debug, Serialize)]
 pub struct ServerModels {
     pub url: String,
-    pub models: Vec<String>,
+    pub models: Vec<ModelCapabilities>,
     pub default_model: String,
 }
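
Note: each entry in models now serializes as an object instead of a bare string, so API consumers need to adjust. A minimal sketch of one ServerModels value and its JSON, using the structs introduced in this commit (the model names are illustrative placeholders):

    let server = ServerModels {
        url: "http://localhost:11434".to_string(),
        models: vec![
            ModelCapabilities { name: "llava:13b".into(), has_vision: true },
            ModelCapabilities { name: "mistral:7b".into(), has_vision: false },
        ],
        default_model: "mistral:7b".to_string(),
    };
    // serde_json::to_string(&server) would yield:
    // {"url":"http://localhost:11434",
    //  "models":[{"name":"llava:13b","has_vision":true},
    //            {"name":"mistral:7b","has_vision":false}],
    //  "default_model":"mistral:7b"}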
@@ -211,18 +211,18 @@ pub async fn get_all_insights_handler(
     }
 }
 
-/// GET /insights/models - List available models from both servers
+/// GET /insights/models - List available models from both servers with capabilities
 #[get("/insights/models")]
 pub async fn get_available_models_handler(
     _claims: Claims,
     app_state: web::Data<crate::state::AppState>,
 ) -> impl Responder {
-    log::debug!("Fetching available models");
+    log::debug!("Fetching available models with capabilities");
 
     let ollama_client = &app_state.ollama;
 
-    // Fetch models from primary server
-    let primary_models = match OllamaClient::list_models(&ollama_client.primary_url).await {
+    // Fetch models with capabilities from primary server
+    let primary_models = match OllamaClient::list_models_with_capabilities(&ollama_client.primary_url).await {
         Ok(models) => models,
         Err(e) => {
             log::warn!("Failed to fetch models from primary server: {:?}", e);
@@ -236,9 +236,9 @@ pub async fn get_available_models_handler(
         default_model: ollama_client.primary_model.clone(),
     };
 
-    // Fetch models from fallback server if configured
+    // Fetch models with capabilities from fallback server if configured
     let fallback = if let Some(fallback_url) = &ollama_client.fallback_url {
-        match OllamaClient::list_models(fallback_url).await {
+        match OllamaClient::list_models_with_capabilities(fallback_url).await {
             Ok(models) => Some(ServerModels {
                 url: fallback_url.clone(),
                 models,
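
Note: the response wrapper is AvailableModelsResponse, whose body is not shown in this diff, so the field names in the sketch below are hypothetical; it only illustrates that GET /insights/models now returns capability objects rather than name strings:

    // Hypothetical response shape: "primary"/"fallback" are assumed names;
    // only ServerModels and ModelCapabilities are confirmed by this diff.
    // {
    //   "primary": {
    //     "url": "http://primary:11434",
    //     "models": [{"name": "llava:13b", "has_vision": true}],
    //     "default_model": "llava:13b"
    //   },
    //   "fallback": null
    // }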

File 2 of 4

@@ -961,23 +961,62 @@ impl InsightGenerator {
             combined_context.len()
         );
 
-        // 8. Load image and encode as base64 for vision models
-        let image_base64 = match self.load_image_as_base64(&file_path) {
+        // 8. Check if the model has vision capabilities
+        let model_to_check = ollama_client.primary_model.clone();
+        let has_vision = match OllamaClient::check_model_capabilities(
+            &ollama_client.primary_url,
+            &model_to_check,
+        )
+        .await
+        {
+            Ok(capabilities) => {
+                log::info!(
+                    "Model '{}' vision capability: {}",
+                    model_to_check,
+                    capabilities.has_vision
+                );
+                capabilities.has_vision
+            }
+            Err(e) => {
+                log::warn!(
+                    "Failed to check vision capabilities for model '{}', assuming no vision support: {}",
+                    model_to_check,
+                    e
+                );
+                false
+            }
+        };
+
+        insight_cx
+            .span()
+            .set_attribute(KeyValue::new("model_has_vision", has_vision));
+
+        // 9. Load image and encode as base64 only if model supports vision
+        let image_base64 = if has_vision {
+            match self.load_image_as_base64(&file_path) {
             Ok(b64) => {
-                log::info!("Successfully loaded image for vision model");
+                log::info!("Successfully loaded image for vision-capable model '{}'", model_to_check);
                 Some(b64)
             }
             Err(e) => {
                 log::warn!("Failed to load image for vision model: {}", e);
                 None
             }
+            }
+        } else {
+            log::info!(
+                "Model '{}' does not support vision, skipping image processing",
+                model_to_check
+            );
+            None
         };
 
-        // 9. Generate title and summary with Ollama (using multi-source context + image)
+        // 10. Generate title and summary with Ollama (using multi-source context + image if supported)
         let title = ollama_client
             .generate_photo_title(
                 date_taken,
                 location.as_deref(),
+                contact.as_deref(),
                 Some(&combined_context),
                 custom_system_prompt.as_deref(),
                 image_base64.clone(),
@@ -988,6 +1027,7 @@ impl InsightGenerator {
             .generate_photo_summary(
                 date_taken,
                 location.as_deref(),
+                contact.as_deref(),
                 Some(&combined_context),
                 custom_system_prompt.as_deref(),
                 image_base64,
@@ -1004,7 +1044,7 @@ impl InsightGenerator {
             .span()
             .set_attribute(KeyValue::new("summary_length", summary.len() as i64));
 
-        // 9. Store in database
+        // 11. Store in database
         let insight = InsertPhotoInsight {
             file_path: file_path.to_string(),
             title,
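
Note: the capability check above fails closed. Any error from check_model_capabilities is logged and treated as has_vision = false, so an image payload is never attached for a model whose support is unknown. The same gating, condensed to a sketch (the helper below is illustrative, not part of the commit):

    // Illustrative condensation of the gating above: check once, then gate
    // the expensive base64 image load on the result, defaulting to text-only.
    async fn has_vision(url: &str, model: &str) -> bool {
        OllamaClient::check_model_capabilities(url, model)
            .await
            .map(|caps| caps.has_vision)
            .unwrap_or(false) // capability-check failure degrades to no image
    }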

File 3 of 4

@@ -11,5 +11,5 @@ pub use handlers::{
     get_available_models_handler, get_insight_handler,
 };
 pub use insight_generator::InsightGenerator;
-pub use ollama::OllamaClient;
+pub use ollama::{ModelCapabilities, OllamaClient};
 pub use sms_client::{SmsApiClient, SmsMessage};

File 4 of 4

@@ -62,6 +62,67 @@ impl OllamaClient {
         Ok(models.iter().any(|m| m == model_name))
     }
 
+    /// Check if a model has vision capabilities using the /api/show endpoint
+    pub async fn check_model_capabilities(url: &str, model_name: &str) -> Result<ModelCapabilities> {
+        let client = Client::builder()
+            .connect_timeout(Duration::from_secs(5))
+            .timeout(Duration::from_secs(10))
+            .build()?;
+
+        #[derive(Serialize)]
+        struct ShowRequest {
+            model: String,
+        }
+
+        let response = client
+            .post(&format!("{}/api/show", url))
+            .json(&ShowRequest {
+                model: model_name.to_string(),
+            })
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            return Err(anyhow::anyhow!(
+                "Failed to get model details for {} from {}",
+                model_name,
+                url
+            ));
+        }
+
+        let show_response: OllamaShowResponse = response.json().await?;
+
+        // Check if "vision" is in the capabilities array
+        let has_vision = show_response.capabilities.iter().any(|cap| cap == "vision");
+
+        Ok(ModelCapabilities {
+            name: model_name.to_string(),
+            has_vision,
+        })
+    }
+
+    /// List all models with their capabilities from a server
+    pub async fn list_models_with_capabilities(url: &str) -> Result<Vec<ModelCapabilities>> {
+        let models = Self::list_models(url).await?;
+        let mut capabilities = Vec::new();
+
+        for model_name in models {
+            match Self::check_model_capabilities(url, &model_name).await {
+                Ok(cap) => capabilities.push(cap),
+                Err(e) => {
+                    log::warn!("Failed to get capabilities for model {}: {}", model_name, e);
+                    // Fallback: assume no vision if we can't check
+                    capabilities.push(ModelCapabilities {
+                        name: model_name,
+                        has_vision: false,
+                    });
+                }
+            }
+        }
+
+        Ok(capabilities)
+    }
+
     /// Extract final answer from thinking model output
     /// Handles <think>...</think> tags and takes everything after
     fn extract_final_answer(&self, response: &str) -> String {
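
Note: check_model_capabilities reads only the capabilities array out of /api/show; every other field in the response is ignored because OllamaShowResponse (defined near the bottom of this file) declares nothing else. A minimal sketch of the parse, with an illustrative body (the "completion" entry is an example value; only "vision" is actually inspected):

    // Illustrative /api/show body; real responses carry many more fields,
    // which serde skips because OllamaShowResponse only declares `capabilities`.
    let body = r#"{ "capabilities": ["completion", "vision"] }"#;
    let parsed: OllamaShowResponse = serde_json::from_str(body)?;
    assert!(parsed.capabilities.iter().any(|cap| cap == "vision"));

Also note that list_models_with_capabilities issues one /api/show call per model, sequentially, so listing a server with N models costs N+1 HTTP round trips. A usage sketch under that caveat (the helper name is hypothetical, not part of this commit):

    // Hypothetical helper: pick the first vision-capable model on a server.
    async fn pick_vision_model(url: &str) -> anyhow::Result<Option<String>> {
        let caps = OllamaClient::list_models_with_capabilities(url).await?;
        Ok(caps.into_iter().find(|c| c.has_vision).map(|c| c.name))
    }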
@@ -216,6 +277,7 @@ impl OllamaClient {
         &self,
         date: NaiveDate,
         location: Option<&str>,
+        contact: Option<&str>,
         sms_summary: Option<&str>,
         custom_system: Option<&str>,
         image_base64: Option<String>,
@@ -224,6 +286,25 @@ impl OllamaClient {
         let sms_str = sms_summary.unwrap_or("No messages");
 
         let prompt = if image_base64.is_some() {
+            if let Some(contact_name) = contact {
+                format!(
+                    r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
+
+Date: {}
+Location: {}
+Person/Contact: {}
+Messages: {}
+
+Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. If limited information is available, use a simple descriptive title based on what you see.
+
+Return ONLY the title, nothing else."#,
+                    date.format("%B %d, %Y"),
+                    location_str,
+                    contact_name,
+                    sms_str,
+                    contact_name
+                )
+            } else {
             format!(
                 r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
 
@@ -238,6 +319,26 @@ Return ONLY the title, nothing else."#,
                 location_str,
                 sms_str
             )
+            }
         } else {
+            if let Some(contact_name) = contact {
+                format!(
+                    r#"Create a short title (maximum 8 words) about this moment:
+
+Date: {}
+Location: {}
+Person/Contact: {}
+Messages: {}
+
+Use specific details from the context above. The photo is from a folder for {}, so they are likely related to this moment. If no specific details are available, use a simple descriptive title.
+
+Return ONLY the title, nothing else."#,
+                    date.format("%B %d, %Y"),
+                    location_str,
+                    contact_name,
+                    sms_str,
+                    contact_name
+                )
+            } else {
             format!(
                 r#"Create a short title (maximum 8 words) about this moment:
@@ -253,6 +354,7 @@ Return ONLY the title, nothing else."#,
                 location_str,
                 sms_str
             )
+            }
         };
 
         let system = custom_system.unwrap_or("You are my long term memory assistant. Use only the information provided. Do not invent details.");
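
Note: to make the four-way branching concrete, this is the title prompt as it renders for the image-plus-contact branch with sample inputs (date 2026-01-11, location "Toronto, ON", contact "Alex"; all values illustrative):

    Create a short title (maximum 8 words) about this moment by analyzing the image and context:

    Date: January 11, 2026
    Location: Toronto, ON
    Person/Contact: Alex
    Messages: Alex: see you at the park!

    Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for Alex, so they are likely in or related to this photo. If limited information is available, use a simple descriptive title based on what you see.

    Return ONLY the title, nothing else.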
@@ -269,6 +371,7 @@ Return ONLY the title, nothing else."#,
         &self,
         date: NaiveDate,
         location: Option<&str>,
+        contact: Option<&str>,
         sms_summary: Option<&str>,
         custom_system: Option<&str>,
         image_base64: Option<String>,
@@ -277,6 +380,24 @@ Return ONLY the title, nothing else."#,
         let sms_str = sms_summary.unwrap_or("No messages");
 
         let prompt = if image_base64.is_some() {
+            if let Some(contact_name) = contact {
+                format!(
+                    r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
+
+Date: {}
+Location: {}
+Person/Contact: {}
+Messages: {}
+
+Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. Mention people's names (especially {}), places, or activities if they appear in either the image or the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#,
+                    date.format("%B %d, %Y"),
+                    location_str,
+                    contact_name,
+                    sms_str,
+                    contact_name,
+                    contact_name
+                )
+            } else {
             format!(
                 r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
 
@@ -289,6 +410,25 @@ Analyze the image and use specific details from both the visual content and the
                 location_str,
                 sms_str
             )
+            }
         } else {
+            if let Some(contact_name) = contact {
+                format!(
+                    r#"Write a 1-3 paragraph description of this moment based on the available information:
+
+Date: {}
+Location: {}
+Person/Contact: {}
+Messages: {}
+
+Use only the specific details provided above. The photo is from a folder for {}, so they are likely related to this moment. Mention people's names (especially {}), places, or activities if they appear in the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual. If the location is unknown omit it"#,
+                    date.format("%B %d, %Y"),
+                    location_str,
+                    contact_name,
+                    sms_str,
+                    contact_name,
+                    contact_name
+                )
+            } else {
             format!(
                 r#"Write a 1-3 paragraph description of this moment based on the available information:
@@ -302,6 +442,7 @@ Use only the specific details provided above. Mention people's names, places, or
                 location_str,
                 sms_str
             )
+            }
         };
 
         let system = custom_system.unwrap_or("You are a memory refreshing assistant who is able to provide insights through analyzing past conversations. Use only the information provided. Do not invent details.");
@@ -482,6 +623,18 @@ struct OllamaModel {
     name: String,
 }
 
+#[derive(Deserialize)]
+struct OllamaShowResponse {
+    #[serde(default)]
+    capabilities: Vec<String>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct ModelCapabilities {
+    pub name: String,
+    pub has_vision: bool,
+}
+
 #[derive(Serialize)]
 struct OllamaEmbedRequest {
     model: String,
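
Note: the #[serde(default)] on capabilities means a server that omits the field entirely (for example, an older Ollama build whose /api/show predates capability reporting) still deserializes cleanly to an empty list, which downstream code reads as has_vision = false rather than an error. A quick sketch (the "license" field stands in for any unrelated response content):

    // A body with no "capabilities" key parses to an empty vec via
    // #[serde(default)], so vision quietly defaults to unsupported.
    let parsed: OllamaShowResponse = serde_json::from_str(r#"{"license": "MIT"}"#)?;
    assert!(parsed.capabilities.is_empty());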