Add check for vision capabilities
This commit is contained in:
@@ -3,7 +3,7 @@ use opentelemetry::KeyValue;
|
|||||||
use opentelemetry::trace::{Span, Status, Tracer};
|
use opentelemetry::trace::{Span, Status, Tracer};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::ai::{InsightGenerator, OllamaClient};
|
use crate::ai::{InsightGenerator, ModelCapabilities, OllamaClient};
|
||||||
use crate::data::Claims;
|
use crate::data::Claims;
|
||||||
use crate::database::InsightDao;
|
use crate::database::InsightDao;
|
||||||
use crate::otel::{extract_context_from_request, global_tracer};
|
use crate::otel::{extract_context_from_request, global_tracer};
|
||||||
@@ -45,7 +45,7 @@ pub struct AvailableModelsResponse {
|
|||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
pub struct ServerModels {
|
pub struct ServerModels {
|
||||||
pub url: String,
|
pub url: String,
|
||||||
pub models: Vec<String>,
|
pub models: Vec<ModelCapabilities>,
|
||||||
pub default_model: String,
|
pub default_model: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -211,18 +211,18 @@ pub async fn get_all_insights_handler(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// GET /insights/models - List available models from both servers
|
/// GET /insights/models - List available models from both servers with capabilities
|
||||||
#[get("/insights/models")]
|
#[get("/insights/models")]
|
||||||
pub async fn get_available_models_handler(
|
pub async fn get_available_models_handler(
|
||||||
_claims: Claims,
|
_claims: Claims,
|
||||||
app_state: web::Data<crate::state::AppState>,
|
app_state: web::Data<crate::state::AppState>,
|
||||||
) -> impl Responder {
|
) -> impl Responder {
|
||||||
log::debug!("Fetching available models");
|
log::debug!("Fetching available models with capabilities");
|
||||||
|
|
||||||
let ollama_client = &app_state.ollama;
|
let ollama_client = &app_state.ollama;
|
||||||
|
|
||||||
// Fetch models from primary server
|
// Fetch models with capabilities from primary server
|
||||||
let primary_models = match OllamaClient::list_models(&ollama_client.primary_url).await {
|
let primary_models = match OllamaClient::list_models_with_capabilities(&ollama_client.primary_url).await {
|
||||||
Ok(models) => models,
|
Ok(models) => models,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
log::warn!("Failed to fetch models from primary server: {:?}", e);
|
log::warn!("Failed to fetch models from primary server: {:?}", e);
|
||||||
@@ -236,9 +236,9 @@ pub async fn get_available_models_handler(
|
|||||||
default_model: ollama_client.primary_model.clone(),
|
default_model: ollama_client.primary_model.clone(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// Fetch models from fallback server if configured
|
// Fetch models with capabilities from fallback server if configured
|
||||||
let fallback = if let Some(fallback_url) = &ollama_client.fallback_url {
|
let fallback = if let Some(fallback_url) = &ollama_client.fallback_url {
|
||||||
match OllamaClient::list_models(fallback_url).await {
|
match OllamaClient::list_models_with_capabilities(fallback_url).await {
|
||||||
Ok(models) => Some(ServerModels {
|
Ok(models) => Some(ServerModels {
|
||||||
url: fallback_url.clone(),
|
url: fallback_url.clone(),
|
||||||
models,
|
models,
|
||||||
|
|||||||
@@ -961,23 +961,62 @@ impl InsightGenerator {
|
|||||||
combined_context.len()
|
combined_context.len()
|
||||||
);
|
);
|
||||||
|
|
||||||
// 8. Load image and encode as base64 for vision models
|
// 8. Check if the model has vision capabilities
|
||||||
let image_base64 = match self.load_image_as_base64(&file_path) {
|
let model_to_check = ollama_client.primary_model.clone();
|
||||||
|
let has_vision = match OllamaClient::check_model_capabilities(
|
||||||
|
&ollama_client.primary_url,
|
||||||
|
&model_to_check,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(capabilities) => {
|
||||||
|
log::info!(
|
||||||
|
"Model '{}' vision capability: {}",
|
||||||
|
model_to_check,
|
||||||
|
capabilities.has_vision
|
||||||
|
);
|
||||||
|
capabilities.has_vision
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
log::warn!(
|
||||||
|
"Failed to check vision capabilities for model '{}', assuming no vision support: {}",
|
||||||
|
model_to_check,
|
||||||
|
e
|
||||||
|
);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
insight_cx
|
||||||
|
.span()
|
||||||
|
.set_attribute(KeyValue::new("model_has_vision", has_vision));
|
||||||
|
|
||||||
|
// 9. Load image and encode as base64 only if model supports vision
|
||||||
|
let image_base64 = if has_vision {
|
||||||
|
match self.load_image_as_base64(&file_path) {
|
||||||
Ok(b64) => {
|
Ok(b64) => {
|
||||||
log::info!("Successfully loaded image for vision model");
|
log::info!("Successfully loaded image for vision-capable model '{}'", model_to_check);
|
||||||
Some(b64)
|
Some(b64)
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
log::warn!("Failed to load image for vision model: {}", e);
|
log::warn!("Failed to load image for vision model: {}", e);
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log::info!(
|
||||||
|
"Model '{}' does not support vision, skipping image processing",
|
||||||
|
model_to_check
|
||||||
|
);
|
||||||
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
// 9. Generate title and summary with Ollama (using multi-source context + image)
|
// 10. Generate title and summary with Ollama (using multi-source context + image if supported)
|
||||||
let title = ollama_client
|
let title = ollama_client
|
||||||
.generate_photo_title(
|
.generate_photo_title(
|
||||||
date_taken,
|
date_taken,
|
||||||
location.as_deref(),
|
location.as_deref(),
|
||||||
|
contact.as_deref(),
|
||||||
Some(&combined_context),
|
Some(&combined_context),
|
||||||
custom_system_prompt.as_deref(),
|
custom_system_prompt.as_deref(),
|
||||||
image_base64.clone(),
|
image_base64.clone(),
|
||||||
@@ -988,6 +1027,7 @@ impl InsightGenerator {
|
|||||||
.generate_photo_summary(
|
.generate_photo_summary(
|
||||||
date_taken,
|
date_taken,
|
||||||
location.as_deref(),
|
location.as_deref(),
|
||||||
|
contact.as_deref(),
|
||||||
Some(&combined_context),
|
Some(&combined_context),
|
||||||
custom_system_prompt.as_deref(),
|
custom_system_prompt.as_deref(),
|
||||||
image_base64,
|
image_base64,
|
||||||
@@ -1004,7 +1044,7 @@ impl InsightGenerator {
|
|||||||
.span()
|
.span()
|
||||||
.set_attribute(KeyValue::new("summary_length", summary.len() as i64));
|
.set_attribute(KeyValue::new("summary_length", summary.len() as i64));
|
||||||
|
|
||||||
// 9. Store in database
|
// 11. Store in database
|
||||||
let insight = InsertPhotoInsight {
|
let insight = InsertPhotoInsight {
|
||||||
file_path: file_path.to_string(),
|
file_path: file_path.to_string(),
|
||||||
title,
|
title,
|
||||||
|
|||||||
@@ -11,5 +11,5 @@ pub use handlers::{
|
|||||||
get_available_models_handler, get_insight_handler,
|
get_available_models_handler, get_insight_handler,
|
||||||
};
|
};
|
||||||
pub use insight_generator::InsightGenerator;
|
pub use insight_generator::InsightGenerator;
|
||||||
pub use ollama::OllamaClient;
|
pub use ollama::{ModelCapabilities, OllamaClient};
|
||||||
pub use sms_client::{SmsApiClient, SmsMessage};
|
pub use sms_client::{SmsApiClient, SmsMessage};
|
||||||
|
|||||||
153
src/ai/ollama.rs
153
src/ai/ollama.rs
@@ -62,6 +62,67 @@ impl OllamaClient {
|
|||||||
Ok(models.iter().any(|m| m == model_name))
|
Ok(models.iter().any(|m| m == model_name))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check if a model has vision capabilities using the /api/show endpoint
|
||||||
|
pub async fn check_model_capabilities(url: &str, model_name: &str) -> Result<ModelCapabilities> {
|
||||||
|
let client = Client::builder()
|
||||||
|
.connect_timeout(Duration::from_secs(5))
|
||||||
|
.timeout(Duration::from_secs(10))
|
||||||
|
.build()?;
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
struct ShowRequest {
|
||||||
|
model: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
let response = client
|
||||||
|
.post(&format!("{}/api/show", url))
|
||||||
|
.json(&ShowRequest {
|
||||||
|
model: model_name.to_string(),
|
||||||
|
})
|
||||||
|
.send()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
if !response.status().is_success() {
|
||||||
|
return Err(anyhow::anyhow!(
|
||||||
|
"Failed to get model details for {} from {}",
|
||||||
|
model_name,
|
||||||
|
url
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let show_response: OllamaShowResponse = response.json().await?;
|
||||||
|
|
||||||
|
// Check if "vision" is in the capabilities array
|
||||||
|
let has_vision = show_response.capabilities.iter().any(|cap| cap == "vision");
|
||||||
|
|
||||||
|
Ok(ModelCapabilities {
|
||||||
|
name: model_name.to_string(),
|
||||||
|
has_vision,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// List all models with their capabilities from a server
|
||||||
|
pub async fn list_models_with_capabilities(url: &str) -> Result<Vec<ModelCapabilities>> {
|
||||||
|
let models = Self::list_models(url).await?;
|
||||||
|
let mut capabilities = Vec::new();
|
||||||
|
|
||||||
|
for model_name in models {
|
||||||
|
match Self::check_model_capabilities(url, &model_name).await {
|
||||||
|
Ok(cap) => capabilities.push(cap),
|
||||||
|
Err(e) => {
|
||||||
|
log::warn!("Failed to get capabilities for model {}: {}", model_name, e);
|
||||||
|
// Fallback: assume no vision if we can't check
|
||||||
|
capabilities.push(ModelCapabilities {
|
||||||
|
name: model_name,
|
||||||
|
has_vision: false,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(capabilities)
|
||||||
|
}
|
||||||
|
|
||||||
/// Extract final answer from thinking model output
|
/// Extract final answer from thinking model output
|
||||||
/// Handles <think>...</think> tags and takes everything after
|
/// Handles <think>...</think> tags and takes everything after
|
||||||
fn extract_final_answer(&self, response: &str) -> String {
|
fn extract_final_answer(&self, response: &str) -> String {
|
||||||
@@ -216,6 +277,7 @@ impl OllamaClient {
|
|||||||
&self,
|
&self,
|
||||||
date: NaiveDate,
|
date: NaiveDate,
|
||||||
location: Option<&str>,
|
location: Option<&str>,
|
||||||
|
contact: Option<&str>,
|
||||||
sms_summary: Option<&str>,
|
sms_summary: Option<&str>,
|
||||||
custom_system: Option<&str>,
|
custom_system: Option<&str>,
|
||||||
image_base64: Option<String>,
|
image_base64: Option<String>,
|
||||||
@@ -224,6 +286,25 @@ impl OllamaClient {
|
|||||||
let sms_str = sms_summary.unwrap_or("No messages");
|
let sms_str = sms_summary.unwrap_or("No messages");
|
||||||
|
|
||||||
let prompt = if image_base64.is_some() {
|
let prompt = if image_base64.is_some() {
|
||||||
|
if let Some(contact_name) = contact {
|
||||||
|
format!(
|
||||||
|
r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
|
||||||
|
|
||||||
|
Date: {}
|
||||||
|
Location: {}
|
||||||
|
Person/Contact: {}
|
||||||
|
Messages: {}
|
||||||
|
|
||||||
|
Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. If limited information is available, use a simple descriptive title based on what you see.
|
||||||
|
|
||||||
|
Return ONLY the title, nothing else."#,
|
||||||
|
date.format("%B %d, %Y"),
|
||||||
|
location_str,
|
||||||
|
contact_name,
|
||||||
|
sms_str,
|
||||||
|
contact_name
|
||||||
|
)
|
||||||
|
} else {
|
||||||
format!(
|
format!(
|
||||||
r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
|
r#"Create a short title (maximum 8 words) about this moment by analyzing the image and context:
|
||||||
|
|
||||||
@@ -238,6 +319,26 @@ Return ONLY the title, nothing else."#,
|
|||||||
location_str,
|
location_str,
|
||||||
sms_str
|
sms_str
|
||||||
)
|
)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if let Some(contact_name) = contact {
|
||||||
|
format!(
|
||||||
|
r#"Create a short title (maximum 8 words) about this moment:
|
||||||
|
|
||||||
|
Date: {}
|
||||||
|
Location: {}
|
||||||
|
Person/Contact: {}
|
||||||
|
Messages: {}
|
||||||
|
|
||||||
|
Use specific details from the context above. The photo is from a folder for {}, so they are likely related to this moment. If no specific details are available, use a simple descriptive title.
|
||||||
|
|
||||||
|
Return ONLY the title, nothing else."#,
|
||||||
|
date.format("%B %d, %Y"),
|
||||||
|
location_str,
|
||||||
|
contact_name,
|
||||||
|
sms_str,
|
||||||
|
contact_name
|
||||||
|
)
|
||||||
} else {
|
} else {
|
||||||
format!(
|
format!(
|
||||||
r#"Create a short title (maximum 8 words) about this moment:
|
r#"Create a short title (maximum 8 words) about this moment:
|
||||||
@@ -253,6 +354,7 @@ Return ONLY the title, nothing else."#,
|
|||||||
location_str,
|
location_str,
|
||||||
sms_str
|
sms_str
|
||||||
)
|
)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let system = custom_system.unwrap_or("You are my long term memory assistant. Use only the information provided. Do not invent details.");
|
let system = custom_system.unwrap_or("You are my long term memory assistant. Use only the information provided. Do not invent details.");
|
||||||
@@ -269,6 +371,7 @@ Return ONLY the title, nothing else."#,
|
|||||||
&self,
|
&self,
|
||||||
date: NaiveDate,
|
date: NaiveDate,
|
||||||
location: Option<&str>,
|
location: Option<&str>,
|
||||||
|
contact: Option<&str>,
|
||||||
sms_summary: Option<&str>,
|
sms_summary: Option<&str>,
|
||||||
custom_system: Option<&str>,
|
custom_system: Option<&str>,
|
||||||
image_base64: Option<String>,
|
image_base64: Option<String>,
|
||||||
@@ -277,6 +380,24 @@ Return ONLY the title, nothing else."#,
|
|||||||
let sms_str = sms_summary.unwrap_or("No messages");
|
let sms_str = sms_summary.unwrap_or("No messages");
|
||||||
|
|
||||||
let prompt = if image_base64.is_some() {
|
let prompt = if image_base64.is_some() {
|
||||||
|
if let Some(contact_name) = contact {
|
||||||
|
format!(
|
||||||
|
r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
|
||||||
|
|
||||||
|
Date: {}
|
||||||
|
Location: {}
|
||||||
|
Person/Contact: {}
|
||||||
|
Messages: {}
|
||||||
|
|
||||||
|
Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. Mention people's names (especially {}), places, or activities if they appear in either the image or the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#,
|
||||||
|
date.format("%B %d, %Y"),
|
||||||
|
location_str,
|
||||||
|
contact_name,
|
||||||
|
sms_str,
|
||||||
|
contact_name,
|
||||||
|
contact_name
|
||||||
|
)
|
||||||
|
} else {
|
||||||
format!(
|
format!(
|
||||||
r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
|
r#"Write a 1-3 paragraph description of this moment by analyzing the image and the available context:
|
||||||
|
|
||||||
@@ -289,6 +410,25 @@ Analyze the image and use specific details from both the visual content and the
|
|||||||
location_str,
|
location_str,
|
||||||
sms_str
|
sms_str
|
||||||
)
|
)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if let Some(contact_name) = contact {
|
||||||
|
format!(
|
||||||
|
r#"Write a 1-3 paragraph description of this moment based on the available information:
|
||||||
|
|
||||||
|
Date: {}
|
||||||
|
Location: {}
|
||||||
|
Person/Contact: {}
|
||||||
|
Messages: {}
|
||||||
|
|
||||||
|
Use only the specific details provided above. The photo is from a folder for {}, so they are likely related to this moment. Mention people's names (especially {}), places, or activities if they appear in the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual. If the location is unknown omit it"#,
|
||||||
|
date.format("%B %d, %Y"),
|
||||||
|
location_str,
|
||||||
|
contact_name,
|
||||||
|
sms_str,
|
||||||
|
contact_name,
|
||||||
|
contact_name
|
||||||
|
)
|
||||||
} else {
|
} else {
|
||||||
format!(
|
format!(
|
||||||
r#"Write a 1-3 paragraph description of this moment based on the available information:
|
r#"Write a 1-3 paragraph description of this moment based on the available information:
|
||||||
@@ -302,6 +442,7 @@ Use only the specific details provided above. Mention people's names, places, or
|
|||||||
location_str,
|
location_str,
|
||||||
sms_str
|
sms_str
|
||||||
)
|
)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let system = custom_system.unwrap_or("You are a memory refreshing assistant who is able to provide insights through analyzing past conversations. Use only the information provided. Do not invent details.");
|
let system = custom_system.unwrap_or("You are a memory refreshing assistant who is able to provide insights through analyzing past conversations. Use only the information provided. Do not invent details.");
|
||||||
@@ -482,6 +623,18 @@ struct OllamaModel {
|
|||||||
name: String,
|
name: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct OllamaShowResponse {
|
||||||
|
#[serde(default)]
|
||||||
|
capabilities: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||||
|
pub struct ModelCapabilities {
|
||||||
|
pub name: String,
|
||||||
|
pub has_vision: bool,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct OllamaEmbedRequest {
|
struct OllamaEmbedRequest {
|
||||||
model: String,
|
model: String,
|
||||||
|
|||||||
Reference in New Issue
Block a user