Add check for vision capabilities

2026-01-11 15:22:24 -05:00
parent 5b35df4007
commit ad0bba63b4
4 changed files with 235 additions and 42 deletions
--- a/src/ai/handlers.rs
+++ b/src/ai/handlers.rs
@@ -3,7 +3,7 @@ use opentelemetry::KeyValue;
 use opentelemetry::trace::{Span, Status, Tracer};
 use serde::{Deserialize, Serialize};
-use crate::ai::{InsightGenerator, OllamaClient};
+use crate::ai::{InsightGenerator, ModelCapabilities, OllamaClient};
 use crate::data::Claims;
 use crate::database::InsightDao;
 use crate::otel::{extract_context_from_request, global_tracer};
@@ -45,7 +45,7 @@ pub struct AvailableModelsResponse {
 #[derive(Debug, Serialize)]
 pub struct ServerModels {
    pub url: String,
-    pub models: Vec<String>,
+    pub models: Vec<ModelCapabilities>,
    pub default_model: String,
 }
@@ -211,18 +211,18 @@ pub async fn get_all_insights_handler(
    }
 }
-/// GET /insights/models - List available models from both servers
+/// GET /insights/models - List available models from both servers with capabilities
 #[get("/insights/models")]
 pub async fn get_available_models_handler(
    _claims: Claims,
    app_state: web::Data<crate::state::AppState>,
 ) -> impl Responder {
-    log::debug!("Fetching available models");
+    log::debug!("Fetching available models with capabilities");
    let ollama_client = &app_state.ollama;
-    // Fetch models from primary server
+    // Fetch models with capabilities from primary server
-    let primary_models = match OllamaClient::list_models(&ollama_client.primary_url).await {
+    let primary_models = match OllamaClient::list_models_with_capabilities(&ollama_client.primary_url).await {
        Ok(models) => models,
        Err(e) => {
            log::warn!("Failed to fetch models from primary server: {:?}", e);
@@ -236,9 +236,9 @@ pub async fn get_available_models_handler(
        default_model: ollama_client.primary_model.clone(),
    };
-    // Fetch models from fallback server if configured
+    // Fetch models with capabilities from fallback server if configured
    let fallback = if let Some(fallback_url) = &ollama_client.fallback_url {
-        match OllamaClient::list_models(fallback_url).await {
+        match OllamaClient::list_models_with_capabilities(fallback_url).await {
            Ok(models) => Some(ServerModels {
                url: fallback_url.clone(),
                models,
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -961,23 +961,62 @@ impl InsightGenerator {
            combined_context.len()
        );
-        // 8. Load image and encode as base64 for vision models
+        // 8. Check if the model has vision capabilities
-        let image_base64 = match self.load_image_as_base64(&file_path) {
+        let model_to_check = ollama_client.primary_model.clone();
        let has_vision = match OllamaClient::check_model_capabilities(
            &ollama_client.primary_url,
            &model_to_check,
        )
        .await
        {
            Ok(capabilities) => {
                log::info!(
                    "Model '{}' vision capability: {}",
                    model_to_check,
                    capabilities.has_vision
                );
                capabilities.has_vision
            }
            Err(e) => {
                log::warn!(
                    "Failed to check vision capabilities for model '{}', assuming no vision support: {}",
                    model_to_check,
                    e
                );
                false
            }
        };
        insight_cx
            .span()
            .set_attribute(KeyValue::new("model_has_vision", has_vision));
        // 9. Load image and encode as base64 only if model supports vision
        let image_base64 = if has_vision {
            match self.load_image_as_base64(&file_path) {
                Ok(b64) => {
-                log::info!("Successfully loaded image for vision model");
+                    log::info!("Successfully loaded image for vision-capable model '{}'", model_to_check);
                    Some(b64)
                }
                Err(e) => {
                    log::warn!("Failed to load image for vision model: {}", e);
                    None
                }
            }
        } else {
            log::info!(
                "Model '{}' does not support vision, skipping image processing",
                model_to_check
            );
            None
        };
-        // 9. Generate title and summary with Ollama (using multi-source context + image)
+        // 10. Generate title and summary with Ollama (using multi-source context + image if supported)
        let title = ollama_client
            .generate_photo_title(
                date_taken,
                location.as_deref(),
                contact.as_deref(),
                Some(&combined_context),
                custom_system_prompt.as_deref(),
                image_base64.clone(),
@@ -988,6 +1027,7 @@ impl InsightGenerator {
            .generate_photo_summary(
                date_taken,
                location.as_deref(),
                contact.as_deref(),
                Some(&combined_context),
                custom_system_prompt.as_deref(),
                image_base64,
@@ -1004,7 +1044,7 @@ impl InsightGenerator {
            .span()
            .set_attribute(KeyValue::new("summary_length", summary.len() as i64));
-        // 9. Store in database
+        // 11. Store in database
        let insight = InsertPhotoInsight {
            file_path: file_path.to_string(),
            title,
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -11,5 +11,5 @@ pub use handlers::{
    get_available_models_handler, get_insight_handler,
 };
 pub use insight_generator::InsightGenerator;
-pub use ollama::OllamaClient;
+pub use ollama::{ModelCapabilities, OllamaClient};
 pub use sms_client::{SmsApiClient, SmsMessage};
--- a/src/ai/ollama.rs
+++ b/src/ai/ollama.rs
@@ -62,6 +62,67 @@ impl OllamaClient {
        Ok(models.iter().any(|m| m == model_name))
    }
    /// Ask a server's `/api/show` endpoint whether a model advertises vision support.
    ///
    /// Sends `POST {url}/api/show` with the model name and reports `has_vision = true`
    /// when the string `"vision"` appears in the response's `capabilities` array.
    ///
    /// # Errors
    ///
    /// Returns an error if the HTTP client cannot be built, the request fails or
    /// exceeds the timeouts configured below, the server replies with a non-success
    /// status (the status is included in the message), or the body cannot be
    /// deserialized.
    pub async fn check_model_capabilities(url: &str, model_name: &str) -> Result<ModelCapabilities> {
        // Borrow the model name for serialization instead of cloning it.
        #[derive(Serialize)]
        struct ShowRequest<'a> {
            model: &'a str,
        }

        let client = Client::builder()
            .connect_timeout(Duration::from_secs(5))
            .timeout(Duration::from_secs(10))
            .build()?;

        let response = client
            .post(format!("{}/api/show", url))
            .json(&ShowRequest { model: model_name })
            .send()
            .await?;

        // Surface the status so callers can tell "unknown model" from a server fault.
        let status = response.status();
        if !status.is_success() {
            return Err(anyhow::anyhow!(
                "Failed to get model details for {} from {} (HTTP {})",
                model_name,
                url,
                status
            ));
        }

        let show_response: OllamaShowResponse = response.json().await?;

        // Check if "vision" is in the capabilities array
        let has_vision = show_response.capabilities.iter().any(|cap| cap == "vision");

        Ok(ModelCapabilities {
            name: model_name.to_string(),
            has_vision,
        })
    }
    /// Enumerate every model on a server together with its detected capabilities.
    ///
    /// The model names come from `list_models`; each one is then probed with
    /// `check_model_capabilities`. A failed probe is logged as a warning and the
    /// model is still returned, with `has_vision` set to `false`.
    ///
    /// # Errors
    ///
    /// Only a failure of `list_models` itself is propagated.
    pub async fn list_models_with_capabilities(url: &str) -> Result<Vec<ModelCapabilities>> {
        let model_names = Self::list_models(url).await?;
        let mut results = Vec::with_capacity(model_names.len());

        for name in model_names {
            let entry = match Self::check_model_capabilities(url, &name).await {
                Ok(found) => found,
                Err(err) => {
                    log::warn!("Failed to get capabilities for model {}: {}", name, err);
                    // Fallback: assume no vision if we can't check
                    ModelCapabilities {
                        name,
                        has_vision: false,
                    }
                }
            };
            results.push(entry);
        }

        Ok(results)
    }
    /// Extract final answer from thinking model output
    /// Handles <think>...</think> tags and takes everything after
    fn extract_final_answer(&self, response: &str) -> String {
@@ -216,6 +277,7 @@ impl OllamaClient {
        &self,
        date: NaiveDate,
        location: Option<&str>,
        contact: Option<&str>,
        sms_summary: Option<&str>,
        custom_system: Option<&str>,
        image_base64: Option<String>,
@@ -224,6 +286,25 @@ impl OllamaClient {
        let sms_str = sms_summary.unwrap_or("No messages");
        let prompt = if image_base64.is_some() {
            if let Some(contact_name) = contact {
                format!(
                    r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
 Date: {}
 Location: {}
 Person/Contact: {}
 Messages: {}
 Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. If limited information is available, use a simple descriptive title based on what you see.
 Return ONLY the title, nothing else."#,
                    date.format("%B %d, %Y"),
                    location_str,
                    contact_name,
                    sms_str,
                    contact_name
                )
            } else {
                format!(
                    r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
@@ -238,6 +319,26 @@ Return ONLY the title, nothing else."#,
                    location_str,
                    sms_str
                )
            }
        } else {
            if let Some(contact_name) = contact {
                format!(
                    r#"Create a short title (maximum 8 words) about this moment:
 Date: {}
 Location: {}
 Person/Contact: {}
 Messages: {}
 Use specific details from the context above. The photo is from a folder for {}, so they are likely related to this moment. If no specific details are available, use a simple descriptive title.
 Return ONLY the title, nothing else."#,
                    date.format("%B %d, %Y"),
                    location_str,
                    contact_name,
                    sms_str,
                    contact_name
                )
            } else {
                format!(
                    r#"Create a short title (maximum 8 words) about this moment:
@@ -253,6 +354,7 @@ Return ONLY the title, nothing else."#,
                    location_str,
                    sms_str
                )
            }
        };
        let system = custom_system.unwrap_or("You are my long term memory assistant. Use only the information provided. Do not invent details.");
@@ -269,6 +371,7 @@ Return ONLY the title, nothing else."#,
        &self,
        date: NaiveDate,
        location: Option<&str>,
        contact: Option<&str>,
        sms_summary: Option<&str>,
        custom_system: Option<&str>,
        image_base64: Option<String>,
@@ -277,6 +380,24 @@ Return ONLY the title, nothing else."#,
        let sms_str = sms_summary.unwrap_or("No messages");
        let prompt = if image_base64.is_some() {
            if let Some(contact_name) = contact {
                format!(
                    r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
 Date: {}
 Location: {}
 Person/Contact: {}
 Messages: {}
 Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. Mention people's names (especially {}), places, or activities if they appear in either the image or the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#,
                    date.format("%B %d, %Y"),
                    location_str,
                    contact_name,
                    sms_str,
                    contact_name,
                    contact_name
                )
            } else {
                format!(
                    r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
@@ -289,6 +410,25 @@ Analyze the image and use specific details from both the visual content and the
                    location_str,
                    sms_str
                )
            }
        } else {
            if let Some(contact_name) = contact {
                format!(
                    r#"Write a 1-3 paragraph description of this moment based on the available information:
 Date: {}
 Location: {}
 Person/Contact: {}
 Messages: {}
 Use only the specific details provided above. The photo is from a folder for {}, so they are likely related to this moment. Mention people's names (especially {}), places, or activities if they appear in the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual. If the location is unknown omit it"#,
                    date.format("%B %d, %Y"),
                    location_str,
                    contact_name,
                    sms_str,
                    contact_name,
                    contact_name
                )
            } else {
                format!(
                    r#"Write a 1-3 paragraph description of this moment based on the available information:
@@ -302,6 +442,7 @@ Use only the specific details provided above. Mention people's names, places, or
                    location_str,
                    sms_str
                )
            }
        };
        let system = custom_system.unwrap_or("You are a memory refreshing assistant who is able to provide insights through analyzing past conversations. Use only the information provided. Do not invent details.");
@@ -482,6 +623,18 @@ struct OllamaModel {
    name: String,
 }
 /// The part of the `/api/show` response body that `check_model_capabilities` reads.
 #[derive(Deserialize)]
 struct OllamaShowResponse {
    /// Capability names reported for the model (e.g. `"vision"`).
    /// `#[serde(default)]` makes a response without this field deserialize to an empty list.
    #[serde(default)]
    capabilities: Vec<String>,
 }
 /// A model name paired with the capability flags detected for it.
 #[derive(Serialize, Deserialize, Clone, Debug)]
 pub struct ModelCapabilities {
    /// Model name, as passed to the capability check.
    pub name: String,
    /// `true` when the server's capability list for this model contains `"vision"`;
    /// `false` otherwise, including when the list of models is built and the lookup fails.
    pub has_vision: bool,
 }
 #[derive(Serialize)]
 struct OllamaEmbedRequest {
    model: String,