Add check for vision capabilities

commit ad0bba63b4
parent 5b35df4007
Author: Cameron
Date:   2026-01-11 15:22:24 -05:00

4 changed files with 235 additions and 42 deletions

File 1 of 4

@@ -3,7 +3,7 @@ use opentelemetry::KeyValue;
 use opentelemetry::trace::{Span, Status, Tracer};
 use serde::{Deserialize, Serialize};
 
-use crate::ai::{InsightGenerator, OllamaClient};
+use crate::ai::{InsightGenerator, ModelCapabilities, OllamaClient};
 use crate::data::Claims;
 use crate::database::InsightDao;
 use crate::otel::{extract_context_from_request, global_tracer};
@@ -45,7 +45,7 @@ pub struct AvailableModelsResponse {
 
 #[derive(Debug, Serialize)]
 pub struct ServerModels {
     pub url: String,
-    pub models: Vec<String>,
+    pub models: Vec<ModelCapabilities>,
     pub default_model: String,
 }
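
Note: each entry in models now serializes as an object instead of a bare string, so API consumers need to adjust. A minimal sketch of one ServerModels value and its JSON, using the structs introduced in this commit (the model names are illustrative placeholders):

    let server = ServerModels {
        url: "http://localhost:11434".to_string(),
        models: vec![
            ModelCapabilities { name: "llava:13b".into(), has_vision: true },
            ModelCapabilities { name: "mistral:7b".into(), has_vision: false },
        ],
        default_model: "mistral:7b".to_string(),
    };
    // serde_json::to_string(&server) would yield:
    // {"url":"http://localhost:11434",
    //  "models":[{"name":"llava:13b","has_vision":true},
    //            {"name":"mistral:7b","has_vision":false}],
    //  "default_model":"mistral:7b"}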
@@ -211,18 +211,18 @@ pub async fn get_all_insights_handler(
     }
 }
 
-/// GET /insights/models - List available models from both servers
+/// GET /insights/models - List available models from both servers with capabilities
 #[get("/insights/models")]
 pub async fn get_available_models_handler(
     _claims: Claims,
     app_state: web::Data<crate::state::AppState>,
 ) -> impl Responder {
-    log::debug!("Fetching available models");
+    log::debug!("Fetching available models with capabilities");
 
     let ollama_client = &app_state.ollama;
 
-    // Fetch models from primary server
-    let primary_models = match OllamaClient::list_models(&ollama_client.primary_url).await {
+    // Fetch models with capabilities from primary server
+    let primary_models = match OllamaClient::list_models_with_capabilities(&ollama_client.primary_url).await {
         Ok(models) => models,
         Err(e) => {
             log::warn!("Failed to fetch models from primary server: {:?}", e);
@@ -236,9 +236,9 @@ pub async fn get_available_models_handler(
         default_model: ollama_client.primary_model.clone(),
     };
 
-    // Fetch models from fallback server if configured
+    // Fetch models with capabilities from fallback server if configured
     let fallback = if let Some(fallback_url) = &ollama_client.fallback_url {
-        match OllamaClient::list_models(fallback_url).await {
+        match OllamaClient::list_models_with_capabilities(fallback_url).await {
             Ok(models) => Some(ServerModels {
                 url: fallback_url.clone(),
                 models,
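
Note: the response wrapper is AvailableModelsResponse, whose body is not shown in this diff, so the field names in the sketch below are hypothetical; it only illustrates that GET /insights/models now returns capability objects rather than name strings:

    // Hypothetical response shape: "primary"/"fallback" are assumed names;
    // only ServerModels and ModelCapabilities are confirmed by this diff.
    // {
    //   "primary": {
    //     "url": "http://primary:11434",
    //     "models": [{"name": "llava:13b", "has_vision": true}],
    //     "default_model": "llava:13b"
    //   },
    //   "fallback": null
    // }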

File 2 of 4

@@ -961,23 +961,62 @@ impl InsightGenerator {
             combined_context.len()
         );
 
-        // 8. Load image and encode as base64 for vision models
-        let image_base64 = match self.load_image_as_base64(&file_path) {
+        // 8. Check if the model has vision capabilities
+        let model_to_check = ollama_client.primary_model.clone();
+        let has_vision = match OllamaClient::check_model_capabilities(
+            &ollama_client.primary_url,
+            &model_to_check,
+        )
+        .await
+        {
+            Ok(capabilities) => {
+                log::info!(
+                    "Model '{}' vision capability: {}",
+                    model_to_check,
+                    capabilities.has_vision
+                );
+                capabilities.has_vision
+            }
+            Err(e) => {
+                log::warn!(
+                    "Failed to check vision capabilities for model '{}', assuming no vision support: {}",
+                    model_to_check,
+                    e
+                );
+                false
+            }
+        };
+
+        insight_cx
+            .span()
+            .set_attribute(KeyValue::new("model_has_vision", has_vision));
+
+        // 9. Load image and encode as base64 only if model supports vision
+        let image_base64 = if has_vision {
+            match self.load_image_as_base64(&file_path) {
             Ok(b64) => {
-                log::info!("Successfully loaded image for vision model");
+                log::info!("Successfully loaded image for vision-capable model '{}'", model_to_check);
                 Some(b64)
             }
             Err(e) => {
                 log::warn!("Failed to load image for vision model: {}", e);
                 None
             }
+            }
+        } else {
+            log::info!(
+                "Model '{}' does not support vision, skipping image processing",
+                model_to_check
+            );
+            None
         };
 
-        // 9. Generate title and summary with Ollama (using multi-source context + image)
+        // 10. Generate title and summary with Ollama (using multi-source context + image if supported)
         let title = ollama_client
             .generate_photo_title(
                 date_taken,
                 location.as_deref(),
+                contact.as_deref(),
                 Some(&combined_context),
                 custom_system_prompt.as_deref(),
                 image_base64.clone(),
@@ -988,6 +1027,7 @@ impl InsightGenerator {
             .generate_photo_summary(
                 date_taken,
                 location.as_deref(),
+                contact.as_deref(),
                 Some(&combined_context),
                 custom_system_prompt.as_deref(),
                 image_base64,
@@ -1004,7 +1044,7 @@ impl InsightGenerator {
             .span()
             .set_attribute(KeyValue::new("summary_length", summary.len() as i64));
 
-        // 9. Store in database
+        // 11. Store in database
         let insight = InsertPhotoInsight {
             file_path: file_path.to_string(),
             title,
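
Note: the capability check above fails closed. Any error from check_model_capabilities is logged and treated as has_vision = false, so an image payload is never attached for a model whose support is unknown. The same gating, condensed to a sketch (the helper below is illustrative, not part of the commit):

    // Illustrative condensation of the gating above: check once, then gate
    // the expensive base64 image load on the result, defaulting to text-only.
    async fn has_vision(url: &str, model: &str) -> bool {
        OllamaClient::check_model_capabilities(url, model)
            .await
            .map(|caps| caps.has_vision)
            .unwrap_or(false) // capability-check failure degrades to no image
    }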

File 3 of 4

@@ -11,5 +11,5 @@ pub use handlers::{
     get_available_models_handler, get_insight_handler,
 };
 pub use insight_generator::InsightGenerator;
-pub use ollama::OllamaClient;
+pub use ollama::{ModelCapabilities, OllamaClient};
 pub use sms_client::{SmsApiClient, SmsMessage};

File 4 of 4

@@ -62,6 +62,67 @@ impl OllamaClient {
         Ok(models.iter().any(|m| m == model_name))
     }
 
+    /// Check if a model has vision capabilities using the /api/show endpoint
+    pub async fn check_model_capabilities(url: &str, model_name: &str) -> Result<ModelCapabilities> {
+        let client = Client::builder()
+            .connect_timeout(Duration::from_secs(5))
+            .timeout(Duration::from_secs(10))
+            .build()?;
+
+        #[derive(Serialize)]
+        struct ShowRequest {
+            model: String,
+        }
+
+        let response = client
+            .post(&format!("{}/api/show", url))
+            .json(&ShowRequest {
+                model: model_name.to_string(),
+            })
+            .send()
+            .await?;
+
+        if !response.status().is_success() {
+            return Err(anyhow::anyhow!(
+                "Failed to get model details for {} from {}",
+                model_name,
+                url
+            ));
+        }
+
+        let show_response: OllamaShowResponse = response.json().await?;
+
+        // Check if "vision" is in the capabilities array
+        let has_vision = show_response.capabilities.iter().any(|cap| cap == "vision");
+
+        Ok(ModelCapabilities {
+            name: model_name.to_string(),
+            has_vision,
+        })
+    }
+
+    /// List all models with their capabilities from a server
+    pub async fn list_models_with_capabilities(url: &str) -> Result<Vec<ModelCapabilities>> {
+        let models = Self::list_models(url).await?;
+        let mut capabilities = Vec::new();
+
+        for model_name in models {
+            match Self::check_model_capabilities(url, &model_name).await {
+                Ok(cap) => capabilities.push(cap),
+                Err(e) => {
+                    log::warn!("Failed to get capabilities for model {}: {}", model_name, e);
+                    // Fallback: assume no vision if we can't check
+                    capabilities.push(ModelCapabilities {
+                        name: model_name,
+                        has_vision: false,
+                    });
+                }
+            }
+        }
+
+        Ok(capabilities)
+    }
+
     /// Extract final answer from thinking model output
     /// Handles <think>...</think> tags and takes everything after
     fn extract_final_answer(&self, response: &str) -> String {
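
Note: check_model_capabilities reads only the capabilities array out of /api/show; every other field in the response is ignored because OllamaShowResponse (defined near the bottom of this file) declares nothing else. A minimal sketch of the parse, with an illustrative body (the "completion" entry is an example value; only "vision" is actually inspected):

    // Illustrative /api/show body; real responses carry many more fields,
    // which serde skips because OllamaShowResponse only declares `capabilities`.
    let body = r#"{ "capabilities": ["completion", "vision"] }"#;
    let parsed: OllamaShowResponse = serde_json::from_str(body)?;
    assert!(parsed.capabilities.iter().any(|cap| cap == "vision"));

Also note that list_models_with_capabilities issues one /api/show call per model, sequentially, so listing a server with N models costs N+1 HTTP round trips. A usage sketch under that caveat (the helper name is hypothetical, not part of this commit):

    // Hypothetical helper: pick the first vision-capable model on a server.
    async fn pick_vision_model(url: &str) -> anyhow::Result<Option<String>> {
        let caps = OllamaClient::list_models_with_capabilities(url).await?;
        Ok(caps.into_iter().find(|c| c.has_vision).map(|c| c.name))
    }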
@@ -216,6 +277,7 @@ impl OllamaClient {
         &self,
         date: NaiveDate,
         location: Option<&str>,
+        contact: Option<&str>,
         sms_summary: Option<&str>,
         custom_system: Option<&str>,
         image_base64: Option<String>,
@@ -224,6 +286,25 @@ impl OllamaClient {
         let sms_str = sms_summary.unwrap_or("No messages");
 
         let prompt = if image_base64.is_some() {
+            if let Some(contact_name) = contact {
+                format!(
+                    r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
+
+Date: {}
+Location: {}
+Person/Contact: {}
+Messages: {}
+
+Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. If limited information is available, use a simple descriptive title based on what you see.
+
+Return ONLY the title, nothing else."#,
+                    date.format("%B %d, %Y"),
+                    location_str,
+                    contact_name,
+                    sms_str,
+                    contact_name
+                )
+            } else {
             format!(
                 r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
 
@@ -238,6 +319,26 @@ Return ONLY the title, nothing else."#,
                 location_str,
                 sms_str
             )
+            }
         } else {
+            if let Some(contact_name) = contact {
+                format!(
+                    r#"Create a short title (maximum 8 words) about this moment:
+
+Date: {}
+Location: {}
+Person/Contact: {}
+Messages: {}
+
+Use specific details from the context above. The photo is from a folder for {}, so they are likely related to this moment. If no specific details are available, use a simple descriptive title.
+
+Return ONLY the title, nothing else."#,
+                    date.format("%B %d, %Y"),
+                    location_str,
+                    contact_name,
+                    sms_str,
+                    contact_name
+                )
+            } else {
             format!(
                 r#"Create a short title (maximum 8 words) about this moment:
@@ -253,6 +354,7 @@ Return ONLY the title, nothing else."#,
                 location_str,
                 sms_str
             )
+            }
         };
 
         let system = custom_system.unwrap_or("You are my long term memory assistant. Use only the information provided. Do not invent details.");
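
Note: to make the four-way branching concrete, this is the title prompt as it renders for the image-plus-contact branch with sample inputs (date 2026-01-11, location "Toronto, ON", contact "Alex"; all values illustrative):

    Create a short title (maximum 8 words) about this moment by analyzing the image and context:

    Date: January 11, 2026
    Location: Toronto, ON
    Person/Contact: Alex
    Messages: Alex: see you at the park!

    Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for Alex, so they are likely in or related to this photo. If limited information is available, use a simple descriptive title based on what you see.

    Return ONLY the title, nothing else.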
@@ -269,6 +371,7 @@ Return ONLY the title, nothing else."#,
         &self,
         date: NaiveDate,
         location: Option<&str>,
+        contact: Option<&str>,
         sms_summary: Option<&str>,
         custom_system: Option<&str>,
         image_base64: Option<String>,
@@ -277,6 +380,24 @@ Return ONLY the title, nothing else."#,
         let sms_str = sms_summary.unwrap_or("No messages");
 
         let prompt = if image_base64.is_some() {
+            if let Some(contact_name) = contact {
+                format!(
+                    r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
+
+Date: {}
+Location: {}
+Person/Contact: {}
+Messages: {}
+
+Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. Mention people's names (especially {}), places, or activities if they appear in either the image or the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#,
+                    date.format("%B %d, %Y"),
+                    location_str,
+                    contact_name,
+                    sms_str,
+                    contact_name,
+                    contact_name
+                )
+            } else {
             format!(
                 r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
 
@@ -289,6 +410,25 @@ Analyze the image and use specific details from both the visual content and the
                 location_str,
                 sms_str
             )
+            }
         } else {
+            if let Some(contact_name) = contact {
+                format!(
+                    r#"Write a 1-3 paragraph description of this moment based on the available information:
+
+Date: {}
+Location: {}
+Person/Contact: {}
+Messages: {}
+
+Use only the specific details provided above. The photo is from a folder for {}, so they are likely related to this moment. Mention people's names (especially {}), places, or activities if they appear in the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual. If the location is unknown omit it"#,
+                    date.format("%B %d, %Y"),
+                    location_str,
+                    contact_name,
+                    sms_str,
+                    contact_name,
+                    contact_name
+                )
+            } else {
             format!(
                 r#"Write a 1-3 paragraph description of this moment based on the available information:
@@ -302,6 +442,7 @@ Use only the specific details provided above. Mention people's names, places, or
                 location_str,
                 sms_str
             )
+            }
         };
 
         let system = custom_system.unwrap_or("You are a memory refreshing assistant who is able to provide insights through analyzing past conversations. Use only the information provided. Do not invent details.");
@@ -482,6 +623,18 @@ struct OllamaModel {
     name: String,
 }
 
+#[derive(Deserialize)]
+struct OllamaShowResponse {
+    #[serde(default)]
+    capabilities: Vec<String>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct ModelCapabilities {
+    pub name: String,
+    pub has_vision: bool,
+}
+
 #[derive(Serialize)]
 struct OllamaEmbedRequest {
     model: String,
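
Note: the #[serde(default)] on capabilities means a server that omits the field entirely (for example, an older Ollama build whose /api/show predates capability reporting) still deserializes cleanly to an empty list, which downstream code reads as has_vision = false rather than an error. A quick sketch (the "license" field stands in for any unrelated response content):

    // A body with no "capabilities" key parses to an empty vec via
    // #[serde(default)], so vision quietly defaults to unsupported.
    let parsed: OllamaShowResponse = serde_json::from_str(r#"{"license": "MIT"}"#)?;
    assert!(parsed.capabilities.is_empty());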