Merge pull request 'feature/llamacpp-backend' (#101) from feature/llamacpp-backend into master

Reviewed-on: #101
This commit was merged in pull request #101.
This commit is contained in:
2026-05-26 18:58:47 +00:00
17 changed files with 2359 additions and 605 deletions

View File

@@ -53,6 +53,33 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
# OPENROUTER_HTTP_REFERER=https://your-site.example
# OPENROUTER_APP_TITLE=ImageApi
# ── AI Insights — local backend switch ──────────────────────────────────
# Picks which local LLM stack the server uses for chat, vision describe,
# and embeddings. `ollama` (default) uses the OLLAMA_* settings above;
# `llamacpp` uses the LLAMA_SWAP_* settings below. The switch is global
# and applies to both `backend=local` and `backend=hybrid` (hybrid keeps
# chat on OpenRouter but still uses this stack for the describe pass).
# Don't flip mid-deploy without re-embedding existing index rows —
# mixed vector spaces break similarity search.
# LLM_BACKEND=ollama
# ── AI Insights — llama.cpp / llama-swap (optional) ─────────────────────
# Set LLAMA_SWAP_URL plus LLM_BACKEND=llamacpp to swap the local stack
# off Ollama. Talks OpenAI-compatible /v1 to a llama-swap proxy fronting
# per-slot llama-server instances. Chat models receive images directly
# via content-parts (vision-capable models assumed); a separate vision
# slot is used only by the describe_photo tool and describe-image utility.
# LLAMA_SWAP_URL=http://localhost:9292/v1
# LLAMA_SWAP_PRIMARY_MODEL=chat
# Optional dedicated vision slot for describe_image. Defaults to
# PRIMARY_MODEL so describe_photo works without extra config.
# LLAMA_SWAP_VISION_MODEL=vision
# LLAMA_SWAP_EMBEDDING_MODEL=embed
# Comma-separated allowlist surfaced by /insights/models when
# LLM_BACKEND=llamacpp. All report has_vision=true.
# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
# ── AI Insights — sibling services (optional) ───────────────────────────
# Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
# typically set only APOLLO_API_BASE_URL and let the face + CLIP

View File

@@ -473,7 +473,7 @@ GET /memories?path=...&recursive=true
POST /insights/generate (non-agentic single-shot)
POST /insights/generate/agentic (tool-calling loop; body: { file_path, backend?, model?, ... })
GET /insights?path=...&library=...
GET /insights/models (local Ollama models + capabilities)
GET /insights/models (local-backend models + capabilities; Ollama OR llama-swap based on LLM_BACKEND)
GET /insights/openrouter/models (curated OpenRouter allowlist)
POST /insights/rate (thumbs up/down for training data)
@@ -631,6 +631,27 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small # Optional, embeddings
OPENROUTER_HTTP_REFERER=https://your-site.example # Optional attribution header
OPENROUTER_APP_TITLE=ImageApi # Optional attribution header
# Local LLM backend switch. `ollama` (default) keeps the OLLAMA_* settings
# above; `llamacpp` swaps the entire local stack (chat + vision describe +
# embeddings) over to llama-swap. The switch is global and applies to
# `backend=local` requests and to `backend=hybrid`'s describe pass (hybrid
# chat still goes to OpenRouter). Don't flip mid-deploy without
# re-embedding — mixed vector spaces break similarity search.
LLM_BACKEND=ollama
# llama.cpp / llama-swap (used when LLM_BACKEND=llamacpp). OpenAI-compatible
# proxy hosting one or more llama-server processes. Chat models receive
# images directly via content-parts (all models assumed vision-capable).
LLAMA_SWAP_URL=http://localhost:9292/v1 # Required when LLM_BACKEND=llamacpp
LLAMA_SWAP_PRIMARY_MODEL=chat # Chat slot id (matches config.yaml)
LLAMA_SWAP_VISION_MODEL= # Dedicated vision slot for describe_image / describe_photo
# tool. Defaults to PRIMARY_MODEL when unset.
LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id
LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist surfaced by GET /insights/models
# when LLM_BACKEND=llamacpp. All report has_vision=true.
# Empty = picker shows only the configured primary model.
LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload
# Insight Chat Continuation
AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6)
```
@@ -650,10 +671,50 @@ The `OllamaClient` provides methods to query available models:
This allows runtime verification of model availability before generating insights.
**Local backend switch (`LLM_BACKEND`):**
One env var decides which "local" stack the server runs against — `ollama`
(default) or `llamacpp`. It's global on purpose: chat, vision, and
embeddings all route through the same backend, so the embedding-vector
column in SQLite stays in one vector space. Don't flip mid-deploy without
re-embedding the affected rows — similarity search will collapse.
- `LLM_BACKEND=ollama`: chat, vision, and embeddings use Ollama. Vision
capability is probed per-model via `/api/show`.
- `LLM_BACKEND=llamacpp`: chat models receive images directly via OpenAI
content-parts (all models assumed vision-capable). Embeddings hit the
`embed` slot. A dedicated `LLAMA_SWAP_VISION_MODEL` slot (defaults to
the chat model) handles `describe_image` for the `describe_photo` tool.
Requires `LLAMA_SWAP_URL`.
The per-request `backend=hybrid` override is orthogonal: it always sends
chat to OpenRouter (text-only, images are pre-described and inlined), but
the describe + embed passes still route through whichever `LLM_BACKEND`
is configured.
**Backend dispatch (`ResolvedBackend`):**
`InsightGenerator::resolve_backend(kind, overrides)` is the single entry
point that builds clients for a request. Returns a `ResolvedBackend` with
two roles: `.chat()` (the agentic/chat client) and `.local()` (local-only
utility calls: rerank, describe_image, embeddings). `BackendKind` is an
enum (`Local` | `Hybrid`) replacing the stringly-typed `"local"` /
`"hybrid"` labels. `SamplingOverrides` groups model/ctx/temp/top_p/top_k/
min_p per-request overrides. All downstream code (`execute_tool`,
`run_streaming_agentic_loop`, etc.) takes `&ResolvedBackend` rather than
individual client references.
`GET /insights/models` returns the local-backend models with capabilities
in the same envelope shape regardless of `LLM_BACKEND`: Ollama servers
when `ollama`, llama-swap slots (from `LLAMA_SWAP_ALLOWED_MODELS`) when
`llamacpp`. No `/insights/llamacpp/models` — the picker reads a single
endpoint.
**Hybrid Backend (OpenRouter):**
- Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`.
- Local Ollama still describes the image (vision); the description is inlined
into the chat prompt and the agentic loop runs on OpenRouter.
- Vision describe happens before the agentic loop; the description is inlined
into the chat prompt and the agentic loop runs on OpenRouter. Vision
routes through whichever `LLM_BACKEND` is configured.
- `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that
call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`.
- No live capability precheck — the operator-curated allowlist is trusted.
@@ -661,6 +722,15 @@ This allows runtime verification of model availability before generating insight
- `GET /insights/openrouter/models` returns `{ models, default_model, configured }`
for client picker UIs.
**Cross-replay matrix (chat continuation):**
- `local → local` allowed (whether served by Ollama or llama-swap; that's
a deploy-time decision, not a request-time one).
- `hybrid → hybrid` allowed.
- `hybrid → local` allowed (the inlined description replays as text).
- `local → hybrid` rejected — the stored transcript has raw images in the
first user message and OpenRouter providers don't accept that shape
consistently. Regenerate the insight in hybrid mode instead.
**Insight Chat Continuation:**
After an agentic insight is generated, the full `Vec<ChatMessage>` transcript is

2
Cargo.lock generated
View File

@@ -2051,7 +2051,7 @@ dependencies = [
[[package]]
name = "image-api"
version = "1.1.0"
version = "1.2.0"
dependencies = [
"actix",
"actix-cors",

View File

@@ -1,6 +1,6 @@
[package]
name = "image-api"
version = "1.1.0"
version = "1.2.0"
authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
edition = "2024"

140
src/ai/backend.rs Normal file
View File

@@ -0,0 +1,140 @@
use anyhow::{Result, anyhow};
use crate::ai::llm_client::LlmClient;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BackendKind {
Local,
Hybrid,
}
impl BackendKind {
pub fn parse(s: &str) -> Result<Self> {
match s.trim().to_lowercase().as_str() {
"local" | "" => Ok(Self::Local),
"hybrid" => Ok(Self::Hybrid),
other => Err(anyhow!(
"unknown backend '{}'; expected 'local' or 'hybrid'",
other
)),
}
}
pub fn as_str(&self) -> &'static str {
match self {
Self::Local => "local",
Self::Hybrid => "hybrid",
}
}
}
impl std::fmt::Display for BackendKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.as_str())
}
}
pub struct SamplingOverrides {
pub model: Option<String>,
pub num_ctx: Option<i32>,
pub temperature: Option<f32>,
pub top_p: Option<f32>,
pub top_k: Option<i32>,
pub min_p: Option<f32>,
}
impl SamplingOverrides {
pub fn has_sampling(&self) -> bool {
self.temperature.is_some()
|| self.top_p.is_some()
|| self.top_k.is_some()
|| self.min_p.is_some()
}
}
pub struct ResolvedBackend {
chat: Box<dyn LlmClient>,
local: Box<dyn LlmClient>,
pub kind: BackendKind,
/// `true` when the chat model receives images directly (Ollama with
/// vision, or llamacpp). `false` for hybrid where we describe-then-inline.
pub images_inline: bool,
}
impl ResolvedBackend {
pub fn new(
chat: Box<dyn LlmClient>,
local: Box<dyn LlmClient>,
kind: BackendKind,
images_inline: bool,
) -> Self {
Self {
chat,
local,
kind,
images_inline,
}
}
pub fn chat(&self) -> &dyn LlmClient {
self.chat.as_ref()
}
pub fn local(&self) -> &dyn LlmClient {
self.local.as_ref()
}
pub fn model(&self) -> &str {
self.chat.primary_model()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn parse_backend_kind() {
assert_eq!(BackendKind::parse("local").unwrap(), BackendKind::Local);
assert_eq!(BackendKind::parse("hybrid").unwrap(), BackendKind::Hybrid);
assert_eq!(BackendKind::parse(" Local ").unwrap(), BackendKind::Local);
assert_eq!(BackendKind::parse("HYBRID").unwrap(), BackendKind::Hybrid);
assert_eq!(BackendKind::parse("").unwrap(), BackendKind::Local);
assert!(BackendKind::parse("vllm").is_err());
}
#[test]
fn backend_kind_as_str_roundtrips() {
assert_eq!(
BackendKind::parse(BackendKind::Local.as_str()).unwrap(),
BackendKind::Local
);
assert_eq!(
BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(),
BackendKind::Hybrid
);
}
#[test]
fn sampling_overrides_has_sampling() {
let empty = SamplingOverrides {
model: None,
num_ctx: None,
temperature: None,
top_p: None,
top_k: None,
min_p: None,
};
assert!(!empty.has_sampling());
let with_temp = SamplingOverrides {
model: None,
num_ctx: Some(4096),
temperature: Some(0.7),
top_p: None,
top_k: None,
min_p: None,
};
assert!(with_temp.has_sampling());
}
}

View File

@@ -470,7 +470,16 @@ pub async fn generate_agentic_insight_handler(
}
}
/// GET /insights/models - List available models from both servers with capabilities
/// GET /insights/models - Local-backend models with capabilities. Returns
/// Ollama servers when `LLM_BACKEND=ollama` (default), or llama-swap slots
/// when `LLM_BACKEND=llamacpp`. Same envelope shape either way so the
/// client picker doesn't have to branch on backend kind.
///
/// For llama-swap: `models` comes verbatim from `LLAMA_SWAP_ALLOWED_MODELS`
/// (no live `/v1/models` probe), `has_vision` is true only for the
/// configured `LLAMA_SWAP_VISION_MODEL` slot id, and `has_tool_calling` is
/// reported as true for every slot (llama-server is launched with `--jinja`
/// by convention — a misconfigured slot surfaces as a chat-call error).
#[get("/insights/models")]
pub async fn get_available_models_handler(
_claims: Claims,
@@ -478,6 +487,29 @@ pub async fn get_available_models_handler(
) -> impl Responder {
log::debug!("Fetching available models with capabilities");
if crate::ai::local_backend_is_llamacpp()
&& let Some(lc) = app_state.llamacpp.as_ref()
{
let models: Vec<ModelCapabilities> = app_state
.llamacpp_allowed_models
.iter()
.map(|name| ModelCapabilities {
name: name.clone(),
has_vision: true,
has_tool_calling: true,
})
.collect();
let primary = ServerModels {
url: lc.base_url.clone(),
models,
default_model: lc.primary_model.clone(),
};
return HttpResponse::Ok().json(AvailableModelsResponse {
primary,
fallback: None,
});
}
let ollama_client = &app_state.ollama;
// Fetch models with capabilities from primary server

View File

@@ -6,10 +6,9 @@ use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use tokio::sync::Mutex as TokioMutex;
use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
use crate::ai::insight_generator::InsightGenerator;
use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool};
use crate::ai::ollama::OllamaClient;
use crate::ai::openrouter::OpenRouterClient;
use crate::ai::llm_client::{ChatMessage, LlmStreamEvent, Tool};
use crate::database::InsightDao;
use crate::database::models::InsertPhotoInsight;
use crate::otel::global_tracer;
@@ -91,8 +90,6 @@ pub struct ChatTurnResult {
#[derive(Clone)]
pub struct InsightChatService {
generator: Arc<InsightGenerator>,
ollama: OllamaClient,
openrouter: Option<Arc<OpenRouterClient>>,
insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
chat_locks: ChatLockMap,
}
@@ -100,15 +97,11 @@ pub struct InsightChatService {
impl InsightChatService {
pub fn new(
generator: Arc<InsightGenerator>,
ollama: OllamaClient,
openrouter: Option<Arc<OpenRouterClient>>,
insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
chat_locks: ChatLockMap,
) -> Self {
Self {
generator,
ollama,
openrouter,
insight_dao,
chat_locks,
}
@@ -303,24 +296,10 @@ impl InsightChatService {
.map(|s| s.trim().to_lowercase())
.filter(|s| !s.is_empty())
.unwrap_or_else(|| stored_backend.clone());
if !matches!(effective_backend.as_str(), "local" | "hybrid") {
bail!(
"unknown backend '{}'; expected 'local' or 'hybrid'",
effective_backend
);
}
if stored_backend == "local" && effective_backend == "hybrid" {
bail!(
"switching from local to hybrid mid-chat isn't supported yet; \
regenerate the insight in hybrid mode if you want OpenRouter chat"
);
}
let is_hybrid = effective_backend == "hybrid";
span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
validate_cross_replay(&stored_backend, &effective_backend)?;
let kind = BackendKind::parse(&effective_backend)?;
span.set_attribute(KeyValue::new("backend", kind.as_str()));
// 4. Build the chat backend client. Ollama in local mode, a freshly
// cloned OpenRouter client in hybrid mode (clone so per-request
// sampling/model overrides don't leak into shared state).
let max_iterations = req
.max_iterations
.unwrap_or(DEFAULT_MAX_ITERATIONS)
@@ -328,91 +307,38 @@ impl InsightChatService {
span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
let stored_model = insight.model_version.clone();
let custom_model = req
let overrides = SamplingOverrides {
model: req
.model
.clone()
.or_else(|| Some(stored_model.clone()))
.filter(|m| !m.is_empty());
let mut ollama_client = self.ollama.clone();
let mut openrouter_client: Option<OpenRouterClient> = None;
if is_hybrid {
let arc = self.openrouter.as_ref().ok_or_else(|| {
anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
})?;
let mut c: OpenRouterClient = (**arc).clone();
if let Some(ref m) = custom_model {
c.primary_model = m.clone();
}
if req.temperature.is_some()
|| req.top_p.is_some()
|| req.top_k.is_some()
|| req.min_p.is_some()
{
c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
}
if let Some(ctx) = req.num_ctx {
c.set_num_ctx(Some(ctx));
}
openrouter_client = Some(c);
} else {
// Local-mode model swap. Build a new client when the chat model
// differs from the configured one (mirrors the agentic pattern).
if let Some(ref m) = custom_model
&& m != &self.ollama.primary_model
{
ollama_client = OllamaClient::new(
self.ollama.primary_url.clone(),
self.ollama.fallback_url.clone(),
m.clone(),
Some(m.clone()),
);
}
if req.temperature.is_some()
|| req.top_p.is_some()
|| req.top_k.is_some()
|| req.min_p.is_some()
{
ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
}
if let Some(ctx) = req.num_ctx {
ollama_client.set_num_ctx(Some(ctx));
}
}
let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client {
c
} else {
&ollama_client
.filter(|m| !m.is_empty()),
num_ctx: req.num_ctx,
temperature: req.temperature,
top_p: req.top_p,
top_k: req.top_k,
min_p: req.min_p,
};
let model_used = chat_backend.primary_model().to_string();
let backend = self.generator.resolve_backend(kind, &overrides).await?;
let model_used = backend.model().to_string();
span.set_attribute(KeyValue::new("model", model_used.clone()));
// 5. Decide vision + tool set. In hybrid we always omit
// `describe_photo` (matches the original generation flow). In
// local we trust the stored history's first-user shape: if it
// carries `images`, the original model was vision-capable, and
// we keep `describe_photo` available.
// 5. Decide vision + tool set. In hybrid (describe-then-inline) mode
// we omit `describe_photo`. Otherwise trust the stored history:
// if the first user message carries images, describe_photo stays.
let local_first_user_has_image = messages
.iter()
.find(|m| m.role == "user")
.and_then(|m| m.images.as_ref())
.map(|imgs| !imgs.is_empty())
.unwrap_or(false);
let offer_describe_tool = !is_hybrid && local_first_user_has_image;
// current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision
// and probes the per-table presence flags. Pass `offer_describe_tool`
// directly — the `!is_hybrid && local_first_user_has_image` decision
// is the chat-path's vision predicate.
let offer_describe_tool = backend.images_inline && local_first_user_has_image;
let gate_opts = self.generator.current_gate_opts_for_persona(
offer_describe_tool,
Some((req.user_id, &active_persona)),
);
let tools = InsightGenerator::build_tool_definitions(gate_opts);
// Image base64 only needed when describe_photo is on the menu. Load
// lazily to avoid disk IO when the loop never invokes it.
let image_base64: Option<String> = if offer_describe_tool {
self.generator.load_image_as_base64(&normalized).ok()
} else {
@@ -461,13 +387,13 @@ impl InsightChatService {
iterations_used = iteration + 1;
log::info!("Chat iteration {}/{}", iterations_used, max_iterations);
let (response, prompt_tokens, eval_tokens) = chat_backend
let (response, prompt_tokens, eval_tokens) = backend
.chat()
.chat_with_tools(messages.clone(), tools.clone())
.await?;
last_prompt_eval_count = prompt_tokens;
last_eval_count = eval_tokens;
// Ollama rejects non-object tool-call arguments on replay.
let mut response = response;
if let Some(ref mut tcs) = response.tool_calls {
for tc in tcs.iter_mut() {
@@ -495,13 +421,11 @@ impl InsightChatService {
.execute_tool(
&tool_call.function.name,
&tool_call.function.arguments,
&ollama_client,
&backend,
&image_base64,
&normalized,
req.user_id,
&active_persona,
&model_used,
&effective_backend,
&loop_cx,
)
.await;
@@ -515,8 +439,6 @@ impl InsightChatService {
}
if final_content.is_empty() {
// The model never produced a final answer; ask once more without
// tools to force a textual reply.
log::info!(
"Chat loop exhausted after {} iterations, requesting final answer",
iterations_used
@@ -524,7 +446,8 @@ impl InsightChatService {
messages.push(ChatMessage::user(
"Please write your final answer now without calling any more tools.",
));
let (final_response, prompt_tokens, eval_tokens) = chat_backend
let (final_response, prompt_tokens, eval_tokens) = backend
.chat()
.chat_with_tools(messages.clone(), vec![])
.await?;
last_prompt_eval_count = prompt_tokens;
@@ -560,7 +483,8 @@ impl InsightChatService {
Capture the key moment or theme. Return ONLY the title, nothing else.",
final_content
);
let title_raw = chat_backend
let title_raw = backend
.chat()
.generate(
&title_prompt,
Some(
@@ -585,7 +509,7 @@ impl InsightChatService {
model_version: model_used.clone(),
is_current: true,
training_messages: Some(json),
backend: effective_backend.clone(),
backend: kind.as_str().to_string(),
fewshot_source_ids: None,
content_hash: None,
};
@@ -610,7 +534,7 @@ impl InsightChatService {
prompt_eval_count: last_prompt_eval_count,
eval_count: last_eval_count,
amended_insight_id,
backend_used: effective_backend,
backend_used: kind.as_str().to_string(),
model_used,
})
}
@@ -799,19 +723,8 @@ impl InsightChatService {
.map(|s| s.trim().to_lowercase())
.filter(|s| !s.is_empty())
.unwrap_or_else(|| stored_backend.clone());
if !matches!(effective_backend.as_str(), "local" | "hybrid") {
bail!(
"unknown backend '{}'; expected 'local' or 'hybrid'",
effective_backend
);
}
if stored_backend == "local" && effective_backend == "hybrid" {
bail!(
"switching from local to hybrid mid-chat isn't supported yet; \
regenerate the insight in hybrid mode if you want OpenRouter chat"
);
}
let is_hybrid = effective_backend == "hybrid";
let kind = BackendKind::parse(&effective_backend)?;
validate_cross_replay(&stored_backend, kind.as_str())?;
let max_iterations = req
.max_iterations
@@ -819,27 +732,31 @@ impl InsightChatService {
.clamp(1, env_max_iterations());
let stored_model = insight.model_version.clone();
let custom_model = req
let overrides = SamplingOverrides {
model: req
.model
.clone()
.or_else(|| Some(stored_model.clone()))
.filter(|m| !m.is_empty());
.filter(|m| !m.is_empty()),
num_ctx: req.num_ctx,
temperature: req.temperature,
top_p: req.top_p,
top_k: req.top_k,
min_p: req.min_p,
};
let backend = self.generator.resolve_backend(kind, &overrides).await?;
let model_used = backend.model().to_string();
let (chat_backend_holder, ollama_client) =
self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
let model_used = chat_backend.primary_model().to_string();
// Tool set — local mode + first user turn carries an image →
// offer describe_photo. Hybrid: visual description was inlined
// when the insight was bootstrapped, no describe tool needed.
// Tool set — images_inline mode + first user turn carries an image →
// offer describe_photo. Describe-then-inline mode (hybrid only):
// visual description was inlined at bootstrap, no describe tool needed.
let local_first_user_has_image = messages
.iter()
.find(|m| m.role == "user")
.and_then(|m| m.images.as_ref())
.map(|imgs| !imgs.is_empty())
.unwrap_or(false);
let offer_describe_tool = !is_hybrid && local_first_user_has_image;
let offer_describe_tool = backend.images_inline && local_first_user_has_image;
let gate_opts = self.generator.current_gate_opts_for_persona(
offer_describe_tool,
Some((req.user_id, &active_persona)),
@@ -870,16 +787,13 @@ impl InsightChatService {
let outcome = self
.run_streaming_agentic_loop(
chat_backend,
&ollama_client,
&backend,
&mut messages,
tools,
&image_base64,
&normalized,
req.user_id,
&active_persona,
&model_used,
&effective_backend,
max_iterations,
&tx,
)
@@ -907,7 +821,8 @@ impl InsightChatService {
let mut amended_insight_id: Option<i32> = None;
if req.amend {
let title = self.generate_title(chat_backend, &final_content).await?;
let (title, body) = crate::ai::insight_generator::parse_title_body(&final_content);
let final_content = body;
// Amended rows intentionally do not inherit the parent's
// `fewshot_source_ids`. The parent's few-shot influence is still
@@ -923,7 +838,7 @@ impl InsightChatService {
model_version: model_used.clone(),
is_current: true,
training_messages: Some(json),
backend: effective_backend.clone(),
backend: kind.as_str().to_string(),
fewshot_source_ids: None,
content_hash: None,
};
@@ -949,7 +864,7 @@ impl InsightChatService {
eval_tokens: last_eval_count,
num_ctx: req.num_ctx,
amended_insight_id,
backend_used: effective_backend,
backend_used: kind.as_str().to_string(),
model_used,
})
.await;
@@ -975,18 +890,23 @@ impl InsightChatService {
.filter(|s| !s.trim().is_empty())
.unwrap_or_else(|| "default".to_string());
let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
let is_hybrid = effective_backend == "hybrid";
let kind = BackendKind::parse(&effective_backend)?;
let max_iterations = req
.max_iterations
.unwrap_or(DEFAULT_MAX_ITERATIONS)
.clamp(1, env_max_iterations());
let custom_model = req.model.clone().filter(|m| !m.is_empty());
let (chat_backend_holder, ollama_client) =
self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
let model_used = chat_backend.primary_model().to_string();
let overrides = SamplingOverrides {
model: req.model.clone().filter(|m| !m.is_empty()),
num_ctx: req.num_ctx,
temperature: req.temperature,
top_p: req.top_p,
top_k: req.top_k,
min_p: req.min_p,
};
let backend = self.generator.resolve_backend(kind, &overrides).await?;
let model_used = backend.model().to_string();
// Load image bytes once. RAW preview fallback is handled inside
// load_image_as_base64. Errors degrade silently — a chat that
@@ -1007,18 +927,17 @@ impl InsightChatService {
_ => None,
});
// Hybrid backend: pre-describe the image via local Ollama vision
// so OpenRouter chat models (which can't see images directly) get
// the visual description as text. Mirrors the same pre-describe
// pass that `generate_agentic_insight_for_photo` does for hybrid.
let visual_block = if is_hybrid {
// Describe-then-inline (hybrid only): pre-describe the image so a
// text-only chat model gets the visual description inline.
// images_inline backends send images directly to the chat model.
let visual_block = if !backend.images_inline {
match image_base64.as_deref() {
Some(b64) => match self.ollama.describe_image(b64).await {
Some(b64) => match backend.local().describe_image(b64).await {
Ok(desc) => {
format!("Visual description (from local vision model):\n{}\n", desc)
}
Err(e) => {
log::warn!("hybrid bootstrap: local describe_image failed: {}", e);
log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
String::new()
}
},
@@ -1028,10 +947,10 @@ impl InsightChatService {
String::new()
};
// Tool gates. Local + image present → expose describe_photo so
// the chat model can re-look at the photo on demand. Hybrid:
// Tool gates. images_inline + image present → expose describe_photo so
// the chat model can re-look at the photo on demand. Non-inline:
// already inlined, no tool needed.
let offer_describe_tool = !is_hybrid && image_base64.is_some();
let offer_describe_tool = backend.images_inline && image_base64.is_some();
let gate_opts = self.generator.current_gate_opts_for_persona(
offer_describe_tool,
Some((req.user_id, &active_persona)),
@@ -1057,23 +976,22 @@ impl InsightChatService {
);
let system_msg = ChatMessage::system(system_content);
let mut user_msg = ChatMessage::user(req.user_message.clone());
if !is_hybrid && let Some(ref img) = image_base64 {
if backend.images_inline {
if let Some(ref img) = image_base64 {
user_msg.images = Some(vec![img.clone()]);
}
}
let mut messages = vec![system_msg, user_msg];
let outcome = self
.run_streaming_agentic_loop(
chat_backend,
&ollama_client,
&backend,
&mut messages,
tools,
&image_base64,
&normalized,
req.user_id,
&active_persona,
&model_used,
&effective_backend,
max_iterations,
&tx,
)
@@ -1086,7 +1004,7 @@ impl InsightChatService {
final_content,
} = outcome;
let title = self.generate_title(chat_backend, &final_content).await?;
let (title, body) = crate::ai::insight_generator::parse_title_body(&final_content);
let json = serde_json::to_string(&messages)
.map_err(|e| anyhow!("failed to serialize chat history: {}", e))?;
@@ -1094,12 +1012,12 @@ impl InsightChatService {
library_id: req.library_id,
file_path: normalized.clone(),
title,
summary: final_content,
summary: body,
generated_at: Utc::now().timestamp(),
model_version: model_used.clone(),
is_current: true,
training_messages: Some(json),
backend: effective_backend.clone(),
backend: kind.as_str().to_string(),
fewshot_source_ids: None,
content_hash: None,
};
@@ -1122,7 +1040,7 @@ impl InsightChatService {
eval_tokens: last_eval_count,
num_ctx: req.num_ctx,
amended_insight_id: Some(stored.id),
backend_used: effective_backend,
backend_used: kind.as_str().to_string(),
model_used,
})
.await;
@@ -1130,105 +1048,19 @@ impl InsightChatService {
Ok(())
}
/// Set up chat clients (Ollama + optional OpenRouter) shared by
/// bootstrap and continuation. Returns the chat-side backend client
/// (boxed because hybrid and local return different concrete types)
/// and the Ollama client used for describe-image / local tool calls.
fn build_chat_clients(
&self,
is_hybrid: bool,
custom_model: Option<&str>,
req: &ChatTurnRequest,
) -> Result<(Box<dyn LlmClient>, OllamaClient)> {
let mut ollama_client = self.ollama.clone();
if is_hybrid {
let arc = self.openrouter.as_ref().ok_or_else(|| {
anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
})?;
let mut c: OpenRouterClient = (**arc).clone();
if let Some(m) = custom_model {
c.primary_model = m.to_string();
}
if req.temperature.is_some()
|| req.top_p.is_some()
|| req.top_k.is_some()
|| req.min_p.is_some()
{
c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
}
if let Some(ctx) = req.num_ctx {
c.set_num_ctx(Some(ctx));
}
return Ok((Box::new(c), ollama_client));
}
if let Some(m) = custom_model
&& m != self.ollama.primary_model
{
ollama_client = OllamaClient::new(
self.ollama.primary_url.clone(),
self.ollama.fallback_url.clone(),
m.to_string(),
Some(m.to_string()),
);
}
if req.temperature.is_some()
|| req.top_p.is_some()
|| req.top_k.is_some()
|| req.min_p.is_some()
{
ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
}
if let Some(ctx) = req.num_ctx {
ollama_client.set_num_ctx(Some(ctx));
}
Ok((Box::new(ollama_client.clone()), ollama_client))
}
/// Generate a short title via the same chat backend so voice stays
/// consistent with the body. Mirrors generate_agentic_insight_for_photo's
/// titling pass.
async fn generate_title(
&self,
chat_backend: &dyn LlmClient,
final_content: &str,
) -> Result<String> {
let title_prompt = format!(
"Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\n\
Capture the key moment or theme. Return ONLY the title, nothing else.",
final_content
);
let title_raw = chat_backend
.generate(
&title_prompt,
Some(
"You are my long term memory assistant. Use only the information provided. Do not invent details.",
),
None,
)
.await?;
Ok(title_raw.trim().trim_matches('"').to_string())
}
/// Drive the agentic loop with streaming SSE events. Shared between
/// bootstrap and continuation. Mutates `messages` in place (response
/// turns + tool results are appended) and returns counters + the
/// final assistant content.
async fn run_streaming_agentic_loop(
&self,
chat_backend: &dyn LlmClient,
ollama_client: &OllamaClient,
backend: &ResolvedBackend,
messages: &mut Vec<ChatMessage>,
tools: Vec<Tool>,
image_base64: &Option<String>,
normalized: &str,
user_id: i32,
active_persona: &str,
// Provenance — stamped onto any store_fact tool call made
// during this loop. Mirrors the non-streaming chat path.
model_used: &str,
effective_backend: &str,
max_iterations: usize,
tx: &tokio::sync::mpsc::Sender<ChatStreamEvent>,
) -> Result<AgenticLoopOutcome> {
@@ -1247,7 +1079,8 @@ impl InsightChatService {
})
.await;
let mut stream = chat_backend
let mut stream = backend
.chat()
.chat_with_tools_stream(messages.clone(), tools.clone())
.await?;
@@ -1304,13 +1137,11 @@ impl InsightChatService {
.execute_tool(
&tool_call.function.name,
&tool_call.function.arguments,
ollama_client,
backend,
image_base64,
normalized,
user_id,
active_persona,
model_used,
effective_backend,
&cx,
)
.await;
@@ -1345,7 +1176,8 @@ impl InsightChatService {
messages.push(ChatMessage::user(
"Please write your final answer now without calling any more tools.",
));
let mut stream = chat_backend
let mut stream = backend
.chat()
.chat_with_tools_stream(messages.clone(), vec![])
.await?;
let mut final_message: Option<ChatMessage> = None;
@@ -1459,6 +1291,34 @@ fn resolve_date_taken_for_context(
.map(|dt| dt.format("%Y-%m-%d").to_string())
}
/// Validate a stored→effective backend transition for a chat continuation.
/// Continuation runs against a transcript that was generated with a specific
/// backend; the only blocked transition is `local → hybrid`, because the
/// stored transcript has images embedded in the first user message and the
/// hybrid path (OpenRouter chat with describe-then-inline) can't replay
/// raw image bytes through OpenRouter consistently across providers.
/// `hybrid → local` is allowed (the inlined description replays verbatim
/// as text).
///
/// Whether "local" routes through Ollama or llama-swap is decided at
/// startup by `LLM_BACKEND`; both share the same transcript shape from
/// the chat-replay perspective.
fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> {
if !matches!(effective, "local" | "hybrid") {
bail!(
"unknown backend '{}'; expected 'local' or 'hybrid'",
effective
);
}
if stored == "local" && effective == "hybrid" {
bail!(
"switching from local to hybrid mid-chat isn't supported; \
regenerate the insight in hybrid mode if you want OpenRouter chat"
);
}
Ok(())
}
/// Pick the backend label for bootstrap. Bootstrap has no stored insight
/// to defer to (that's continuation's behaviour), so the default is
/// `"local"`. Returns an error if the supplied label is non-empty but
@@ -2082,10 +1942,40 @@ mod tests {
#[test]
fn bootstrap_backend_rejects_unknown_label() {
let err = resolve_bootstrap_backend(Some("openrouter")).unwrap_err();
// `llamacpp` is no longer a per-request backend value — it's chosen
// at deploy time via `LLM_BACKEND`.
for label in &["openrouter", "llamacpp", "ollama"] {
let err = resolve_bootstrap_backend(Some(label)).unwrap_err();
let msg = format!("{}", err);
assert!(msg.contains("unknown backend"));
assert!(msg.contains("openrouter"));
assert!(msg.contains("unknown backend"), "label={}", label);
}
}
#[test]
fn cross_replay_rejects_local_to_hybrid() {
let err = validate_cross_replay("local", "hybrid").unwrap_err();
assert!(format!("{}", err).contains("local to hybrid"));
}
#[test]
fn cross_replay_allows_supported_transitions() {
assert!(validate_cross_replay("local", "local").is_ok());
assert!(validate_cross_replay("hybrid", "hybrid").is_ok());
// Hybrid → local replays the inlined description as plain text.
assert!(validate_cross_replay("hybrid", "local").is_ok());
}
#[test]
fn cross_replay_rejects_unknown_effective() {
// Both "openrouter" and the former "llamacpp" value are unknown now.
for label in &["openrouter", "llamacpp"] {
let err = validate_cross_replay("local", label).unwrap_err();
assert!(
format!("{}", err).contains("unknown backend"),
"label={}",
label
);
}
}
#[test]

View File

@@ -10,6 +10,8 @@ use std::io::Cursor;
use std::sync::{Arc, Mutex};
use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
use crate::ai::llamacpp::LlamaCppClient;
use crate::ai::llm_client::LlmClient;
use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
use crate::ai::openrouter::OpenRouterClient;
@@ -26,6 +28,42 @@ use crate::otel::global_tracer;
use crate::tags::TagDao;
use crate::utils::{earliest_fs_time, normalize_path};
/// Parse a "Title: ...\n\n<body>" response into (title, body).
/// Falls back to the first sentence as the title if the model didn't
/// follow the format.
pub(crate) fn parse_title_body(raw: &str) -> (String, String) {
let trimmed = raw.trim();
// Try "Title: <title>\n\n<body>" or "Title: <title>\n<body>"
if let Some(rest) = trimmed
.strip_prefix("Title:")
.or_else(|| trimmed.strip_prefix("title:"))
{
let rest = rest.trim_start();
if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) {
let title = rest[..split_pos].trim();
let body = rest[split_pos..].trim();
if !title.is_empty() && !body.is_empty() {
return (title.to_string(), body.to_string());
}
}
}
// Fallback: first sentence (up to first `. ` or `.\n`) becomes the title
if let Some(pos) = trimmed.find(". ").or_else(|| trimmed.find(".\n")) {
let title = &trimmed[..pos];
let body = trimmed[pos + 1..].trim();
if title.len() <= 100 && !body.is_empty() {
return (title.to_string(), body.to_string());
}
}
// Last resort: truncate to 60 chars for title, full text as body
let title: String = trimmed.chars().take(60).collect();
let title = title.trim_end().to_string();
(title, trimmed.to_string())
}
/// Combine an optional personal Apollo Place with an optional Nominatim
/// reverse-geocoded city, falling back to bare coordinates when neither
/// resolves. Free function so we can test it cheaply without spinning up
@@ -68,6 +106,9 @@ pub struct InsightGenerator {
/// Optional OpenRouter client, used when `backend=hybrid` is requested.
/// `None` when `OPENROUTER_API_KEY` is not configured.
openrouter: Option<Arc<OpenRouterClient>>,
/// Optional llama-swap client, used when `backend=llamacpp` is requested.
/// `None` when `LLAMA_SWAP_URL` is not configured.
llamacpp: Option<Arc<LlamaCppClient>>,
sms_client: SmsApiClient,
/// Optional integration with Apollo's user-defined Places. When the
/// integration is disabled (`APOLLO_API_BASE_URL` unset), every
@@ -120,6 +161,7 @@ impl InsightGenerator {
pub fn new(
ollama: OllamaClient,
openrouter: Option<Arc<OpenRouterClient>>,
llamacpp: Option<Arc<LlamaCppClient>>,
sms_client: SmsApiClient,
apollo_client: ApolloClient,
insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
@@ -137,6 +179,7 @@ impl InsightGenerator {
Self {
ollama,
openrouter,
llamacpp,
sms_client,
apollo_client,
insight_dao,
@@ -465,8 +508,11 @@ impl InsightGenerator {
log::info!("RAG QUERY: {}", query);
log::info!("========================================");
// Generate embedding for the query
let query_embedding = self.ollama.generate_embedding(&query).await?;
// Generate embedding for the query via the configured local backend
// (`LLM_BACKEND` switch). Must match the backend that populated the
// daily-summary embeddings or similarity search will be garbage.
let query_embedding =
crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query).await?;
// Search for similar daily summaries with time-based weighting
// This prioritizes summaries temporally close to the query date
@@ -557,7 +603,7 @@ impl InsightGenerator {
let calendar_cx = parent_cx.with_span(span);
let query_embedding = if let Some(loc) = location {
match self.ollama.generate_embedding(loc).await {
match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), loc).await {
Ok(emb) => Some(emb),
Err(e) => {
log::warn!("Failed to generate embedding for location '{}': {}", loc, e);
@@ -728,7 +774,8 @@ impl InsightGenerator {
)
};
let query_embedding = match self.ollama.generate_embedding(&query_text).await {
let query_embedding =
match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query_text).await {
Ok(emb) => emb,
Err(e) => {
log::warn!("Failed to generate search embedding: {}", e);
@@ -1583,29 +1630,27 @@ Return ONLY the summary, nothing else."#,
&self,
tool_name: &str,
arguments: &serde_json::Value,
ollama: &OllamaClient,
backend: &ResolvedBackend,
image_base64: &Option<String>,
file_path: &str,
user_id: i32,
persona_id: &str,
// Provenance — written into entity_facts.created_by_* when
// the loop calls store_fact. The caller knows the actual
// chat-runtime model and backend (which may differ from
// ollama.primary_model in hybrid mode where chat lives on
// OpenRouter while Ollama still handles vision).
model: &str,
backend: &str,
cx: &opentelemetry::Context,
) -> String {
let model = backend.model();
let backend_label = backend.kind.as_str();
let result = match tool_name {
"search_rag" => self.tool_search_rag(arguments, ollama, cx).await,
"search_rag" => self.tool_search_rag(arguments, backend.local(), cx).await,
"search_messages" => self.tool_search_messages(arguments, cx).await,
"get_sms_messages" => self.tool_get_sms_messages(arguments, cx).await,
"get_calendar_events" => self.tool_get_calendar_events(arguments, cx).await,
"get_location_history" => self.tool_get_location_history(arguments, cx).await,
"get_file_tags" => self.tool_get_file_tags(arguments, cx).await,
"get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await,
"describe_photo" => self.tool_describe_photo(ollama, image_base64).await,
"describe_photo" => {
self.tool_describe_photo(backend.local(), image_base64)
.await
}
"reverse_geocode" => self.tool_reverse_geocode(arguments).await,
"get_personal_place_at" => self.tool_get_personal_place_at(arguments).await,
"recall_entities" => self.tool_recall_entities(arguments, cx).await,
@@ -1613,19 +1658,25 @@ Return ONLY the summary, nothing else."#,
self.tool_recall_facts_for_photo(arguments, user_id, persona_id, cx)
.await
}
"store_entity" => self.tool_store_entity(arguments, ollama, cx).await,
"store_entity" => self.tool_store_entity(arguments, cx).await,
"store_fact" => {
self.tool_store_fact(
arguments, file_path, user_id, persona_id, model, backend, cx,
arguments,
file_path,
user_id,
persona_id,
model,
backend_label,
cx,
)
.await
}
"update_fact" => {
self.tool_update_fact(arguments, user_id, persona_id, model, backend, cx)
self.tool_update_fact(arguments, user_id, persona_id, model, backend_label, cx)
.await
}
"supersede_fact" => {
self.tool_supersede_fact(arguments, user_id, persona_id, model, backend, cx)
self.tool_supersede_fact(arguments, user_id, persona_id, model, backend_label, cx)
.await
}
"get_current_datetime" => Self::tool_get_current_datetime(),
@@ -1643,7 +1694,7 @@ Return ONLY the summary, nothing else."#,
async fn tool_search_rag(
&self,
args: &serde_json::Value,
ollama: &OllamaClient,
local: &dyn LlmClient,
_cx: &opentelemetry::Context,
) -> String {
let query = match args.get("query").and_then(|v| v.as_str()) {
@@ -1707,7 +1758,7 @@ Return ONLY the summary, nothing else."#,
};
let final_results = if rerank_enabled && results.len() > limit {
match self.rerank_with_llm(&query, &results, limit, ollama).await {
match self.rerank_with_llm(&query, &results, limit, local).await {
Ok(reordered) => reordered,
Err(e) => {
log::warn!("rerank failed, using vector order: {}", e);
@@ -1733,7 +1784,7 @@ Return ONLY the summary, nothing else."#,
query: &str,
candidates: &[String],
limit: usize,
ollama: &OllamaClient,
local: &dyn LlmClient,
) -> Result<Vec<String>> {
let query_preview: String = query.chars().take(60).collect();
log::info!(
@@ -1771,14 +1822,9 @@ Return ONLY the summary, nothing else."#,
);
let started = std::time::Instant::now();
let response = ollama
.generate_no_think(
&prompt,
Some(
"You are a terse relevance ranker. You output only numbers separated by commas.",
),
)
.await?;
let system =
Some("You are a terse relevance ranker. You output only numbers separated by commas.");
let response = local.generate(&prompt, system, None).await?;
log::info!(
"rerank: finished in {} ms (prompt={} chars)",
started.elapsed().as_millis(),
@@ -1925,6 +1971,12 @@ Return ONLY the summary, nothing else."#,
.unwrap_or(20)
.clamp(1, 50) as usize;
let contact_id = args.get("contact_id").and_then(|v| v.as_i64());
let contact = args
.get("contact")
.and_then(|v| v.as_str())
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
.filter(|_| contact_id.is_none());
let start_ts = args.get("start_ts").and_then(|v| v.as_i64());
let end_ts = args.get("end_ts").and_then(|v| v.as_i64());
let is_mms = args.get("is_mms").and_then(|v| v.as_bool());
@@ -1932,10 +1984,11 @@ Return ONLY the summary, nothing else."#,
let has_date_filter = start_ts.is_some() || end_ts.is_some();
log::info!(
"tool_search_messages: query='{}', mode={}, contact_id={:?}, range=[{:?}, {:?}], is_mms={:?}, has_media={:?}, limit={}",
"tool_search_messages: query='{}', mode={}, contact_id={:?}, contact={:?}, range=[{:?}, {:?}], is_mms={:?}, has_media={:?}, limit={}",
query,
mode,
contact_id,
contact,
start_ts,
end_ts,
is_mms,
@@ -1950,6 +2003,7 @@ Return ONLY the summary, nothing else."#,
mode: mode.as_str(),
limit: user_limit,
contact_id,
contact,
date_from: start_ts,
date_to: end_ts,
is_mms,
@@ -2350,16 +2404,14 @@ Return ONLY the summary, nothing else."#,
out
}
/// Tool: describe_photo — generate a visual description of the photo
async fn tool_describe_photo(
&self,
ollama: &OllamaClient,
local: &dyn LlmClient,
image_base64: &Option<String>,
) -> String {
log::info!("tool_describe_photo: generating visual description");
match image_base64 {
Some(img) => match ollama.generate_photo_description(img).await {
Some(img) => match local.describe_image(img).await {
Ok(desc) => desc,
Err(e) => format!("Error describing photo: {}", e),
},
@@ -2602,11 +2654,12 @@ Return ONLY the summary, nothing else."#,
}
}
/// Tool: store_entity — upsert an entity into the knowledge memory
/// Tool: store_entity — upsert an entity into the knowledge memory.
/// Embeddings go through the configured local backend (`LLM_BACKEND`),
/// independent of the per-request chat backend in the caller.
async fn tool_store_entity(
&self,
args: &serde_json::Value,
ollama: &OllamaClient,
cx: &opentelemetry::Context,
) -> String {
use crate::database::models::InsertEntity;
@@ -2666,9 +2719,11 @@ Return ONLY the summary, nothing else."#,
.collect()
};
// Generate embedding for name + description (best-effort)
// Generate embedding for name + description (best-effort) via the
// configured local backend.
let embed_text = format!("{} {}", name, description);
let embedding: Option<Vec<u8>> = match ollama.generate_embedding(&embed_text).await {
let embedding: Option<Vec<u8>> =
match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &embed_text).await {
Ok(vec) => {
let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
Some(bytes)
@@ -3025,14 +3080,16 @@ Return ONLY the summary, nothing else."#,
}
tools.push(Tool::function(
"search_messages",
"search_messages",
"Search SMS/MMS messages — bodies and (for MMS) attachment text + filenames. \
Modes: `fts5` (keyword + phrase + prefix + AND/OR/NOT + NEAR proximity), \
`semantic` (embedding similarity, requires generated embeddings), `hybrid` (RRF merge, recommended; \
degrades to fts5 when embeddings absent). Optional filters: `start_ts` / `end_ts` (real-UTC unix \
seconds), `contact_id`, `is_mms` (true = MMS only, false = SMS only), `has_media` (true = messages \
with image/video/audio attachments only). For pure date / contact browsing without keywords, prefer \
`get_sms_messages`. \
seconds), `contact` (contact name, case-insensitive), `contact_id` (numeric), `is_mms` \
(true = MMS only, false = SMS only), `has_media` (true = messages with image/video/audio \
attachments only). Prefer `contact` over `contact_id` — the name is resolved server-side. \
If both are provided, `contact_id` takes precedence. \
For pure date / contact browsing without keywords, prefer `get_sms_messages`. \
\n\nFTS5 query syntax (works in fts5 + hybrid modes):\n\
- Phrase: `\"trader joe's\"` — exact word sequence (use double quotes).\n\
- Prefix: `restaur*` — matches restaurant, restaurants, restauranteur, ….\n\
@@ -3042,6 +3099,7 @@ Return ONLY the summary, nothing else."#,
Unquoted multi-word queries are treated as implicit AND. Apostrophes / hyphens / colons are safe — they no longer downgrade to a slow LIKE scan. Use `mode: \"fts5\"` when you want the operators above to be authoritative; `hybrid` still respects them but may surface semantically-similar non-keyword hits alongside.\n\n\
Examples:\n\
- `{query: \"trader joe's\"}` — phrase across all time.\n\
- `{query: \"dinner\", contact: \"Mom\"}` — keyword scoped to Mom's messages.\n\
- `{query: \"dinner\", contact_id: 42, start_ts: 1700000000, end_ts: 1700604800}` — keyword within a contact and a week.\n\
- `{query: \"vacation\", has_media: true}` — only matches that include photos / videos.\n\
- `{query: \"wedding OR reception OR ceremony\", mode: \"fts5\"}` — any of several synonyms.\n\
@@ -3055,6 +3113,7 @@ Return ONLY the summary, nothing else."#,
"mode": { "type": "string", "enum": ["fts5", "semantic", "hybrid"], "description": "Search strategy. Default: hybrid." },
"limit": { "type": "integer", "description": "Max results (default 20, max 50)." },
"contact_id": { "type": "integer", "description": "Optional numeric contact id to scope the search." },
"contact": { "type": "string", "description": "Optional contact name (case-insensitive). Resolved to contact_id server-side. Use this when you know the name but not the ID." },
"start_ts": { "type": "integer", "description": "Optional inclusive lower bound, real-UTC unix seconds." },
"end_ts": { "type": "integer", "description": "Optional inclusive upper bound, real-UTC unix seconds." },
"is_mms": { "type": "boolean", "description": "Optional: true to restrict to MMS, false to restrict to SMS." },
@@ -3526,7 +3585,8 @@ Return ONLY the summary, nothing else."#,
- When you identify people / places / events / things, use store_entity + store_fact to grow the persistent memory.\n\
- Before store_entity, call recall_entities to check whether a similar name already exists; reuse the existing entity_id rather than creating a near-duplicate (e.g. \"Sara\" vs \"Sarah J.\"). The DAO will collapse obvious cosine matches, but choosing the existing id keeps facts and photo links consolidated.\n\
- Predicates should be relationship-shaped verbs that encode a queryable claim — `lives_in`, `works_at`, `attended`, `is_friend_of`, `is_parent_of`, `interested_in`, `married_to`, `owns`. DO NOT use vague speech-act predicates like `expressed`, `said`, `mentioned`, `stated`, `quoted`, `noted`, `discussed`, `thought`, `wondered`. DO NOT store quotations or sentence fragments as `object_value` — paraphrase into a structured claim. Bad: `(Cameron, expressed, \"I'm tempted to get a part-time job there\")`. Good: `(Cameron, considered_employment_at, <Place>)` or `(Cameron, interested_in, \"part-time work\")`.\n\
- A tool returning no results is informative; continue with the others.",
- A tool returning no results is informative; continue with the others.\n\
- When writing your final answer, start with \"Title: <short title>\" (max 8 words) on the first line, then a blank line, then the body.",
);
let mut out = identity;
@@ -3541,6 +3601,184 @@ Return ONLY the summary, nothing else."#,
out
}
/// Consolidate client construction for the agentic insight loop.
///
/// Returns a [`ResolvedBackend`] containing the **chat** client (the model
/// that drives the agent loop), the **local** client (always the configured
/// local backend — Ollama or llama-swap — for utility calls like
/// describe_image, rerank, embeddings), the backend kind, and whether the
/// chat model receives images inline.
pub async fn resolve_backend(
&self,
kind: BackendKind,
overrides: &SamplingOverrides,
) -> Result<ResolvedBackend> {
let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
let is_hybrid = kind == BackendKind::Hybrid;
// ── chat client ────────────────────────────────────────────────
let chat: Box<dyn LlmClient> = if is_hybrid {
// Hybrid: chat through OpenRouter.
let arc = self.openrouter.as_ref().ok_or_else(|| {
anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
})?;
let mut c: OpenRouterClient = (**arc).clone();
if let Some(ref m) = overrides.model {
c.primary_model = m.clone();
}
if overrides.has_sampling() {
c.set_sampling_params(
overrides.temperature,
overrides.top_p,
overrides.top_k,
overrides.min_p,
);
}
if let Some(ctx) = overrides.num_ctx {
c.set_num_ctx(Some(ctx));
}
Box::new(c)
} else if local_via_llamacpp {
// Local via llama-swap.
let arc = self.llamacpp.as_ref().ok_or_else(|| {
anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
})?;
let mut c: LlamaCppClient = (**arc).clone();
if let Some(ref m) = overrides.model {
c.primary_model = m.clone();
}
if overrides.has_sampling() {
c.set_sampling_params(
overrides.temperature,
overrides.top_p,
overrides.top_k,
overrides.min_p,
);
}
if let Some(ctx) = overrides.num_ctx {
c.set_num_ctx(Some(ctx));
}
Box::new(c)
} else {
// Pure Ollama local.
let mut ollama_client = if let Some(ref model) = overrides.model {
OllamaClient::new(
self.ollama.primary_url.clone(),
self.ollama.fallback_url.clone(),
model.clone(),
Some(model.clone()),
)
} else {
self.ollama.clone()
};
if overrides.has_sampling() {
ollama_client.set_sampling_params(
overrides.temperature,
overrides.top_p,
overrides.top_k,
overrides.min_p,
);
}
if let Some(ctx) = overrides.num_ctx {
ollama_client.set_num_ctx(Some(ctx));
}
Box::new(ollama_client)
};
// ── local client (utility calls: rerank, describe_image, etc.) ─
// For llamacpp in local mode: mirror the chat model selection so
// rerank / describe_image hit the same model that's already
// loaded — avoids a mid-turn model swap in llama-swap exclusive
// mode. In hybrid mode the override is an OpenRouter model id
// (e.g. "google/gemini-3-flash-preview") which llama-swap can't
// serve — keep the configured local slots.
let local: Box<dyn LlmClient> = if local_via_llamacpp {
let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone();
if !is_hybrid && let Some(ref m) = overrides.model {
lc.primary_model = m.clone();
lc.set_vision_model(m.clone());
}
Box::new(lc)
} else {
Box::new(self.ollama.clone())
};
// ── images_inline ──────────────────────────────────────────────
let images_inline = if is_hybrid {
// Hybrid: chat model never sees images — describe-then-inject.
false
} else if local_via_llamacpp {
// llama-swap models receive images directly via OpenAI content
// parts. Capability probing isn't available (no `/api/show`),
// so assume vision support; a misconfigured model surfaces as
// a chat-call error.
true
} else {
// Pure Ollama: probe model capabilities.
let ollama_for_caps = if let Some(ref model) = overrides.model {
// Verify custom model is available on at least one server.
let available_on_primary =
OllamaClient::is_model_available(&self.ollama.primary_url, model)
.await
.unwrap_or(false);
let available_on_fallback = if let Some(ref fallback_url) = self.ollama.fallback_url
{
OllamaClient::is_model_available(fallback_url, model)
.await
.unwrap_or(false)
} else {
false
};
if !available_on_primary && !available_on_fallback {
anyhow::bail!(
"model not available: '{}' not found on any configured server",
model
);
}
model.as_str()
} else {
self.ollama.primary_model.as_str()
};
let capabilities = match OllamaClient::check_model_capabilities(
&self.ollama.primary_url,
ollama_for_caps,
)
.await
{
Ok(caps) => caps,
Err(_) => {
let fallback_url =
self.ollama.fallback_url.as_deref().ok_or_else(|| {
anyhow::anyhow!(
"Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
ollama_for_caps
)
})?;
OllamaClient::check_model_capabilities(fallback_url, ollama_for_caps)
.await
.map_err(|e| {
anyhow::anyhow!(
"Failed to check model capabilities for '{}': {}",
ollama_for_caps,
e
)
})?
}
};
if !capabilities.has_tool_calling {
anyhow::bail!("tool calling not supported by model '{}'", ollama_for_caps);
}
capabilities.has_vision
};
Ok(ResolvedBackend::new(chat, local, kind, images_inline))
}
pub async fn generate_agentic_insight_for_photo(
&self,
file_path: &str,
@@ -3568,192 +3806,23 @@ Return ONLY the summary, nothing else."#,
span.set_attribute(KeyValue::new("file_path", file_path.clone()));
span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
// 1a. Resolve backend label (defaults to "local").
let backend_label = backend
.as_deref()
.map(|s| s.trim().to_lowercase())
.filter(|s| !s.is_empty())
.unwrap_or_else(|| "local".to_string());
if !matches!(backend_label.as_str(), "local" | "hybrid") {
return Err(anyhow::anyhow!(
"unknown backend '{}'; expected 'local' or 'hybrid'",
backend_label
));
}
span.set_attribute(KeyValue::new("backend", backend_label.clone()));
let is_hybrid = backend_label == "hybrid";
// 1b. Always build an Ollama client. In local mode it owns the chat
// loop; in hybrid mode it still handles describe_image + any
// tool-local calls (e.g. if a future tool needs embeddings).
// Sampling overrides only apply in local mode — in hybrid the
// user's params belong to the OpenRouter chat client.
let apply_sampling_to_ollama = !is_hybrid;
let mut ollama_client = if let Some(ref model) = custom_model
&& !is_hybrid
{
log::info!("Using custom model for agentic: {}", model);
span.set_attribute(KeyValue::new("custom_model", model.clone()));
OllamaClient::new(
self.ollama.primary_url.clone(),
self.ollama.fallback_url.clone(),
model.clone(),
Some(model.clone()),
)
} else {
if !is_hybrid {
span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
}
self.ollama.clone()
};
if apply_sampling_to_ollama {
if let Some(ctx) = num_ctx {
log::info!("Using custom context size: {}", ctx);
span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
ollama_client.set_num_ctx(Some(ctx));
}
if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
log::info!(
"Using sampling params — temperature: {:?}, top_p: {:?}, top_k: {:?}, min_p: {:?}",
// 1. Resolve backend + build clients.
let kind = BackendKind::parse(backend.as_deref().unwrap_or("local"))?;
span.set_attribute(KeyValue::new("backend", kind.as_str()));
let overrides = SamplingOverrides {
model: custom_model,
num_ctx,
temperature,
top_p,
top_k,
min_p
);
if let Some(t) = temperature {
span.set_attribute(KeyValue::new("temperature", t as f64));
}
if let Some(p) = top_p {
span.set_attribute(KeyValue::new("top_p", p as f64));
}
if let Some(k) = top_k {
span.set_attribute(KeyValue::new("top_k", k as i64));
}
if let Some(m) = min_p {
span.set_attribute(KeyValue::new("min_p", m as f64));
}
ollama_client.set_sampling_params(temperature, top_p, top_k, min_p);
}
}
// 1c. In hybrid mode, clone the configured OpenRouter client and
// apply per-request overrides.
let openrouter_client: Option<OpenRouterClient> = if is_hybrid {
let arc = self.openrouter.as_ref().ok_or_else(|| {
anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
})?;
let mut c: OpenRouterClient = (**arc).clone();
if let Some(ref m) = custom_model {
c.primary_model = m.clone();
span.set_attribute(KeyValue::new("custom_model", m.clone()));
}
span.set_attribute(KeyValue::new("openrouter_model", c.primary_model.clone()));
if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
if let Some(t) = temperature {
span.set_attribute(KeyValue::new("temperature", t as f64));
}
if let Some(p) = top_p {
span.set_attribute(KeyValue::new("top_p", p as f64));
}
if let Some(k) = top_k {
span.set_attribute(KeyValue::new("top_k", k as i64));
}
if let Some(m) = min_p {
span.set_attribute(KeyValue::new("min_p", m as f64));
}
c.set_sampling_params(temperature, top_p, top_k, min_p);
}
if let Some(ctx) = num_ctx {
span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
c.set_num_ctx(Some(ctx));
}
Some(c)
} else {
None
min_p,
};
let backend = self.resolve_backend(kind, &overrides).await?;
span.set_attribute(KeyValue::new("model", backend.model().to_string()));
span.set_attribute(KeyValue::new("images_inline", backend.images_inline));
let insight_cx = current_cx.with_span(span);
// 2. Verify chat model supports tool calling.
// - local: existing Ollama model availability + capability check.
// - hybrid: trust the operator's curated allowlist
// (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
// surfaces as a chat-call error on the next step.
let has_vision = if is_hybrid {
// In hybrid mode the chat model never sees images directly — we
// describe-then-inject, so `has_vision` drives only whether we
// bother loading the image to describe it, which we always do.
true
} else {
if let Some(ref model_name) = custom_model {
let available_on_primary =
OllamaClient::is_model_available(&ollama_client.primary_url, model_name)
.await
.unwrap_or(false);
let available_on_fallback =
if let Some(ref fallback_url) = ollama_client.fallback_url {
OllamaClient::is_model_available(fallback_url, model_name)
.await
.unwrap_or(false)
} else {
false
};
if !available_on_primary && !available_on_fallback {
anyhow::bail!(
"model not available: '{}' not found on any configured server",
model_name
);
}
}
let model_name_for_caps = &ollama_client.primary_model;
let capabilities = match OllamaClient::check_model_capabilities(
&ollama_client.primary_url,
model_name_for_caps,
)
.await
{
Ok(caps) => caps,
Err(_) => {
let fallback_url = ollama_client.fallback_url.as_deref().ok_or_else(|| {
anyhow::anyhow!(
"Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
model_name_for_caps
)
})?;
OllamaClient::check_model_capabilities(fallback_url, model_name_for_caps)
.await
.map_err(|e| {
anyhow::anyhow!(
"Failed to check model capabilities for '{}': {}",
model_name_for_caps,
e
)
})?
}
};
if !capabilities.has_tool_calling {
return Err(anyhow::anyhow!(
"tool calling not supported by model '{}'",
ollama_client.primary_model
));
}
insight_cx
.span()
.set_attribute(KeyValue::new("model_has_vision", capabilities.has_vision));
insight_cx
.span()
.set_attribute(KeyValue::new("model_has_tool_calling", true));
capabilities.has_vision
};
// 3. Fetch EXIF
let exif = {
let mut exif_dao = self.exif_dao.lock().expect("Unable to lock ExifDao");
@@ -3845,38 +3914,32 @@ Return ONLY the summary, nothing else."#,
}
};
// 7. Load image if vision capable.
// In hybrid mode we ALSO describe it locally now so the
// description can be inlined as text — the OpenRouter chat model
// never receives the base64 image directly.
let image_base64 = if has_vision {
match self.load_image_as_base64(&file_path) {
// 7. Load image. Always attempted — vision-capable models get the
// base64 inline; hybrid mode describes it locally and injects text.
let image_base64 = match self.load_image_as_base64(&file_path) {
Ok(b64) => {
log::info!("Loaded image for vision-capable agentic model");
log::info!("Loaded image for agentic model");
Some(b64)
}
Err(e) => {
log::warn!("Failed to load image for agentic vision: {}", e);
log::warn!("Failed to load image for agentic: {}", e);
None
}
}
} else {
None
};
let hybrid_visual_description: Option<String> = if is_hybrid {
// Describe-then-inline (hybrid only). Vision describe routes through
// the local backend so non-text work stays off OpenRouter.
let inlined_visual_description: Option<String> = if !backend.images_inline {
match image_base64.as_deref() {
Some(b64) => match self.ollama.describe_image(b64).await {
Some(b64) => match backend.local().describe_image(b64).await {
Ok(desc) => {
log::info!(
"Hybrid: local vision describe succeeded ({} chars)",
desc.len()
);
log::info!("{}: vision describe succeeded ({} chars)", kind, desc.len());
Some(desc)
}
Err(e) => {
log::warn!(
"Hybrid: local vision describe failed, continuing without: {}",
"{}: vision describe failed, continuing without: {}",
kind,
e
);
None
@@ -3934,7 +3997,7 @@ Return ONLY the summary, nothing else."#,
.map(|c| format!("Contact/Person: {}", c))
.unwrap_or_else(|| "Contact/Person: unknown".to_string());
let visual_block = hybrid_visual_description
let visual_block = inlined_visual_description
.as_deref()
.map(|d| format!("Visual description (from local vision model):\n{}\n\n", d))
.unwrap_or_default();
@@ -3953,31 +4016,24 @@ Return ONLY the summary, nothing else."#,
date = date_taken.format("%B %d, %Y"),
);
// 10. Define tools. Gate flags computed from current data presence;
// hybrid mode omits describe_photo since the chat model receives
// the visual description inline (so we pass `false` for has_vision
// in hybrid mode regardless of the model's actual capability).
let gate_opts = self.current_gate_opts(has_vision && !is_hybrid);
// 10. Define tools. describe_photo offered only when the chat model
// sees images directly (images_inline); in hybrid mode the visual
// description is already inlined as text.
let gate_opts = self.current_gate_opts(backend.images_inline);
let tools = Self::build_tool_definitions(gate_opts);
// 11. Build initial messages. In hybrid mode images are never
// attached to the wire message — the description is part of
// `user_content`.
// 11. Build initial messages. images_inline → attach base64 to the
// user message; describe-then-inline → text was already injected.
let system_msg = ChatMessage::system(system_content);
let mut user_msg = ChatMessage::user(user_content);
if !is_hybrid && let Some(ref img) = image_base64 {
if backend.images_inline {
if let Some(ref img) = image_base64 {
user_msg.images = Some(vec![img.clone()]);
}
}
let mut messages = vec![system_msg, user_msg];
// 12. Agentic loop — dispatch through the selected backend.
let chat_backend: &dyn LlmClient = if let Some(ref or_c) = openrouter_client {
or_c
} else {
&ollama_client
};
let loop_span = tracer.start_with_context("ai.agentic.loop", &insight_cx);
let loop_cx = insight_cx.with_span(loop_span);
@@ -3990,7 +4046,8 @@ Return ONLY the summary, nothing else."#,
iterations_used = iteration + 1;
log::info!("Agentic iteration {}/{}", iteration + 1, max_iterations);
let (response, prompt_tokens, eval_tokens) = chat_backend
let (response, prompt_tokens, eval_tokens) = backend
.chat()
.chat_with_tools(messages.clone(), tools.clone())
.await?;
@@ -4030,13 +4087,11 @@ Return ONLY the summary, nothing else."#,
.execute_tool(
&tool_call.function.name,
&tool_call.function.arguments,
&ollama_client,
&backend,
&image_base64,
&file_path,
user_id,
&persona_id,
chat_backend.primary_model(),
&backend_label,
&loop_cx,
)
.await;
@@ -4057,10 +4112,11 @@ Return ONLY the summary, nothing else."#,
iterations_used
);
messages.push(ChatMessage::user(format!(
"Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.",
"Based on the context gathered, please write the final photo insight. Start with \"Title: <short title>\" on the first line (max 8 words), then a blank line, then the detailed personal summary. Write in first person as {}.",
user_display_name()
)));
let (final_response, prompt_tokens, eval_tokens) = chat_backend
let (final_response, prompt_tokens, eval_tokens) = backend
.chat()
.chat_with_tools(messages.clone(), vec![])
.await?;
last_prompt_eval_count = prompt_tokens;
@@ -4074,20 +4130,11 @@ Return ONLY the summary, nothing else."#,
.set_attribute(KeyValue::new("iterations_used", iterations_used as i64));
loop_cx.span().set_status(Status::Ok);
// 13. Generate title via the same backend so voice stays consistent.
let title_prompt = format!(
"Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\nCapture the key moment or theme. Return ONLY the title, nothing else.",
final_content
);
let title_system = custom_system_prompt.as_deref().unwrap_or(
"You are my long term memory assistant. Use only the information provided. Do not invent details.",
);
let title_raw = chat_backend
.generate(&title_prompt, Some(title_system), None)
.await?;
let title = title_raw.trim().trim_matches('"').to_string();
// 13. Parse title from the model's inline response.
let (title, body) = parse_title_body(&final_content);
final_content = body;
log::info!("Agentic generated title: {}", title);
log::info!("Agentic parsed title: {}", title);
let summary_preview: String = final_content.chars().take(200).collect();
log::info!(
"Agentic generated summary ({} chars): {}",
@@ -4105,7 +4152,7 @@ Return ONLY the summary, nothing else."#,
};
// 15. Store insight (returns the persisted row including its new id)
let model_version = chat_backend.primary_model().to_string();
let model_version = backend.model().to_string();
let fewshot_source_ids_json = if fewshot_source_ids.is_empty() {
None
} else {
@@ -4120,7 +4167,7 @@ Return ONLY the summary, nothing else."#,
model_version,
is_current: true,
training_messages,
backend: backend_label.clone(),
backend: kind.as_str().to_string(),
fewshot_source_ids: fewshot_source_ids_json,
content_hash: None,
};
@@ -4738,4 +4785,78 @@ mod tests {
assert!(out.contains("-> empty (pivoted)"));
assert!(out.contains("Final insight: Final title"));
}
#[test]
fn parse_title_body_standard_format() {
let (t, b) = parse_title_body("Title: Summer at the Lake\n\nWe spent the afternoon...");
assert_eq!(t, "Summer at the Lake");
assert_eq!(b, "We spent the afternoon...");
}
#[test]
fn parse_title_body_single_newline() {
let (t, b) = parse_title_body("Title: Morning Walk\nThe sun was rising...");
assert_eq!(t, "Morning Walk");
assert_eq!(b, "The sun was rising...");
}
#[test]
fn parse_title_body_lowercase_prefix() {
let (t, b) = parse_title_body("title: Garden Party\n\nEveryone gathered...");
assert_eq!(t, "Garden Party");
assert_eq!(b, "Everyone gathered...");
}
#[test]
fn parse_title_body_fallback_first_sentence() {
let (t, b) = parse_title_body("A warm summer day. We gathered at the park for a picnic.");
assert_eq!(t, "A warm summer day");
assert_eq!(b, "We gathered at the park for a picnic.");
}
#[test]
fn parse_title_body_fallback_truncate() {
let input = "A single long paragraph with no periods or title prefix that just keeps going on and on";
let (t, b) = parse_title_body(input);
assert!(t.len() <= 60);
assert_eq!(b, input);
}
/// Regression: hybrid mode was leaking the OpenRouter model override
/// into the local llamacpp client, causing describe_image to send
/// e.g. "google/gemini-3-flash-preview" to llama-swap (which 400s).
#[test]
fn resolve_backend_hybrid_does_not_leak_model_to_local_llamacpp() {
use crate::ai::llamacpp::LlamaCppClient;
let mut base =
LlamaCppClient::new(Some("http://localhost:9292/v1".into()), Some("chat".into()));
base.set_vision_model("vision".into());
base.set_embedding_model("embed".into());
let openrouter_model = "google/gemini-3-flash-preview";
let overrides_model: Option<String> = Some(openrouter_model.into());
let is_hybrid = true;
// Replicate the resolve_backend local-client construction
// (lines ~3686-3695 of this file).
let mut lc = base.clone();
if let Some(ref m) = overrides_model {
if !is_hybrid {
lc.primary_model = m.clone();
lc.set_vision_model(m.clone());
}
}
// In hybrid mode the local client must keep its configured slots.
assert_eq!(
lc.vision_model, "vision",
"hybrid mode must not override vision_model with OpenRouter model"
);
assert_eq!(
lc.primary_model, "chat",
"hybrid mode must not override primary_model with OpenRouter model"
);
assert_eq!(lc.embedding_model, "embed");
}
}

1143
src/ai/llamacpp.rs Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,10 +1,12 @@
pub mod apollo_client;
pub mod backend;
pub mod clip_client;
pub mod daily_summary_job;
pub mod face_client;
pub mod handlers;
pub mod insight_chat;
pub mod insight_generator;
pub mod llamacpp;
pub mod llm_client;
pub mod ollama;
pub mod openrouter;
@@ -23,6 +25,7 @@ pub use handlers::{
get_insight_handler, get_openrouter_models_handler, rate_insight_handler,
};
pub use insight_generator::InsightGenerator;
pub use llamacpp::LlamaCppClient;
#[allow(unused_imports)]
pub use llm_client::{
ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction,
@@ -38,3 +41,87 @@ pub use sms_client::{SmsApiClient, SmsMessage};
pub fn user_display_name() -> String {
std::env::var("USER_NAME").unwrap_or_else(|_| "Me".to_string())
}
/// One switch for the "local" LLM stack: when `LLM_BACKEND=llamacpp` is
/// set, chat / vision describe / embeddings all route through llama-swap
/// instead of Ollama. Any other value (including unset, the default) is
/// Ollama. This is intentionally global — embeddings must be drawn from
/// a single source or similarity search across the index breaks (mixed
/// vector spaces, possibly mixed dims). The `backend=hybrid` per-request
/// override remains orthogonal: it always sends chat to OpenRouter, and
/// uses `LLM_BACKEND` for the describe-then-inline vision pass.
pub fn local_backend_is_llamacpp() -> bool {
matches!(
std::env::var("LLM_BACKEND")
.ok()
.as_deref()
.map(|s| s.trim().to_lowercase())
.as_deref(),
Some("llamacpp")
)
}
/// Embed one string via the configured local backend. Routes through
/// llama-swap when `LLM_BACKEND=llamacpp` (and a client is configured),
/// else Ollama. Returns the single embedding vector. See
/// [`local_backend_is_llamacpp`] for the rationale on consistency.
pub async fn embed_one(
ollama: &OllamaClient,
llamacpp: Option<&LlamaCppClient>,
text: &str,
) -> anyhow::Result<Vec<f32>> {
if local_backend_is_llamacpp() {
if let Some(lc) = llamacpp {
let mut vecs = <LlamaCppClient as LlmClient>::generate_embeddings(lc, &[text]).await?;
return vecs
.pop()
.ok_or_else(|| anyhow::anyhow!("llama-swap returned no embeddings"));
}
log::warn!(
"LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured; falling back to Ollama embeddings"
);
}
ollama.generate_embedding(text).await
}
#[cfg(test)]
mod env_dispatch_tests {
use super::*;
fn with_env<F: FnOnce()>(key: &str, val: Option<&str>, f: F) {
let prev = std::env::var(key).ok();
match val {
Some(v) => unsafe { std::env::set_var(key, v) },
None => unsafe { std::env::remove_var(key) },
}
f();
match prev {
Some(v) => unsafe { std::env::set_var(key, v) },
None => unsafe { std::env::remove_var(key) },
}
}
#[test]
fn llm_backend_defaults_to_ollama() {
with_env("LLM_BACKEND", None, || {
assert!(!local_backend_is_llamacpp());
});
}
#[test]
fn llm_backend_llamacpp_case_insensitive() {
with_env("LLM_BACKEND", Some("LlamaCpp"), || {
assert!(local_backend_is_llamacpp());
});
with_env("LLM_BACKEND", Some(" llamacpp "), || {
assert!(local_backend_is_llamacpp());
});
}
#[test]
fn llm_backend_unknown_value_is_ollama() {
with_env("LLM_BACKEND", Some("vllm"), || {
assert!(!local_backend_is_llamacpp());
});
}
}

View File

@@ -281,6 +281,9 @@ impl SmsApiClient {
if let Some(cid) = params.contact_id {
url.push_str(&format!("&contact_id={}", cid));
}
if let Some(ref c) = params.contact {
url.push_str(&format!("&contact={}", urlencoding::encode(c)));
}
if let Some(off) = params.offset {
url.push_str(&format!("&offset={}", off));
}
@@ -413,6 +416,9 @@ pub struct SmsSearchParams<'a> {
pub mode: &'a str,
pub limit: usize,
pub contact_id: Option<i64>,
/// Contact name (case-insensitive). Resolved to a numeric ID by the
/// SMS-API server when `contact_id` is not set.
pub contact: Option<String>,
/// Unix-seconds inclusive lower bound on `date`.
pub date_from: Option<i64>,
/// Unix-seconds inclusive upper bound on `date`.

View File

@@ -195,6 +195,7 @@ async fn main() -> anyhow::Result<()> {
let generator = InsightGenerator::new(
ollama,
None,
None,
sms_client,
apollo_client,
insight_dao.clone(),

View File

@@ -62,6 +62,15 @@ pub fn large_preview_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
.join(format!("{}.jpg", hash))
}
/// Hash-keyed xlarge-preview path: `<thumbs_dir>/_xlarge/<hash[..2]>/<hash>.jpg`.
pub fn xlarge_preview_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
let shard = shard_prefix(hash);
thumbs_dir
.join("_xlarge")
.join(shard)
.join(format!("{}.jpg", hash))
}
/// Hash-keyed HLS output directory: `<video_dir>/<hash[..2]>/<hash>/`.
/// The playlist lives at `playlist.m3u8` inside this directory and its
/// segments are co-located so HLS relative references Just Work. See

View File

@@ -194,6 +194,7 @@ pub enum MediaType {
#[serde(rename_all = "lowercase")]
pub enum PhotoSize {
Full,
XLarge,
Large,
Thumb,
}

View File

@@ -83,12 +83,14 @@ pub async fn get_image(
if let Some((library, path)) = resolved {
let image_size = req.size.unwrap_or(PhotoSize::Full);
// `size=large` is only meaningful for stills — there's no useful
// "2048px video preview" tier. Videos fall back to the existing
// thumb pipeline (which already handles gif/static selection).
// `mut` so the Large branch can downgrade itself to `Full` after a
// generation failure (RAW-preview branch below keys off `Full`).
let mut image_size = if image_size == PhotoSize::Large && file_types::is_video_file(&path) {
// `size=large|xlarge` is only meaningful for stills — there's no
// useful "resized video preview" tier. Videos fall back to the
// existing thumb pipeline (which already handles gif/static
// selection). `mut` so preview branches can downgrade to `Full`
// after a generation failure.
let mut image_size = if (image_size == PhotoSize::Large || image_size == PhotoSize::XLarge)
&& file_types::is_video_file(&path)
{
PhotoSize::Thumb
} else {
image_size
@@ -196,6 +198,93 @@ pub async fn get_image(
image_size = PhotoSize::Full;
}
if image_size == PhotoSize::XLarge {
let relative_path = path
.strip_prefix(&library.root_path)
.expect("Error stripping library root prefix from xlarge preview");
let relative_path_str = relative_path.to_string_lossy().replace('\\', "/");
let thumbs = Path::new(&app_state.thumbnail_path);
let xlarge_dir = thumbs.join("_xlarge");
let hash_xlarge_path: Option<PathBuf> = {
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
match dao.get_exif(&context, &relative_path_str) {
Ok(Some(row)) => row
.content_hash
.as_deref()
.map(|h| content_hash::xlarge_preview_path(thumbs, h)),
_ => None,
}
};
let scoped_legacy_xlarge_path =
content_hash::library_scoped_legacy_path(&xlarge_dir, library.id, relative_path);
let existing = hash_xlarge_path
.as_ref()
.filter(|p| p.exists())
.cloned()
.or_else(|| {
if scoped_legacy_xlarge_path.exists() {
Some(scoped_legacy_xlarge_path.clone())
} else {
None
}
});
if let Some(found) = existing {
if let Ok(file) = NamedFile::open(&found) {
span.set_status(Status::Ok);
return file
.use_etag(true)
.use_last_modified(true)
.prefer_utf8(true)
.into_response(&request);
}
}
let dest = hash_xlarge_path
.clone()
.unwrap_or_else(|| scoped_legacy_xlarge_path.clone());
let src = path.clone();
let dest_for_block = dest.clone();
let generated = web::block(move || {
if let Some(parent) = dest_for_block.parent() {
std::fs::create_dir_all(parent)?;
}
let tmp = dest_for_block.with_extension("jpg.tmp");
crate::thumbnails::generate_xlarge_preview(&src, &tmp)?;
std::fs::rename(&tmp, &dest_for_block)?;
Ok::<(), std::io::Error>(())
})
.await;
match generated {
Ok(Ok(())) => {
if let Ok(file) = NamedFile::open(&dest) {
span.set_status(Status::Ok);
return file
.use_etag(true)
.use_last_modified(true)
.prefer_utf8(true)
.into_response(&request);
}
}
Ok(Err(e)) => {
warn!(
"XLarge preview generation failed for {:?}: {} — falling back to original",
path, e
);
}
Err(e) => {
warn!(
"XLarge preview blocking-pool error for {:?}: {} — falling back to original",
path, e
);
}
}
image_size = PhotoSize::Full;
}
if image_size == PhotoSize::Thumb {
let relative_path = path
.strip_prefix(&library.root_path)

View File

@@ -2,6 +2,7 @@ use crate::ai::apollo_client::ApolloClient;
use crate::ai::clip_client::ClipClient;
use crate::ai::face_client::FaceClient;
use crate::ai::insight_chat::{ChatLockMap, InsightChatService};
use crate::ai::llamacpp::LlamaCppClient;
use crate::ai::openrouter::OpenRouterClient;
use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient};
use crate::database::{
@@ -62,6 +63,16 @@ pub struct AppState {
/// Curated list of OpenRouter model ids exposed to clients. Sourced from
/// `OPENROUTER_ALLOWED_MODELS` (comma-separated). Empty when unset.
pub openrouter_allowed_models: Vec<String>,
/// `None` when `LLAMA_SWAP_URL` is not configured. Consulted only when a
/// request explicitly opts into `backend=llamacpp`. Same shape as the
/// `openrouter` slot — present here so handlers can route to it without
/// threading through the generator.
#[allow(dead_code)]
pub llamacpp: Option<Arc<LlamaCppClient>>,
/// Curated list of llama-swap model ids exposed to clients. Sourced from
/// `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated). Empty when unset; the
/// server then falls back to `LLAMA_SWAP_PRIMARY_MODEL`.
pub llamacpp_allowed_models: Vec<String>,
pub sms_client: SmsApiClient,
pub insight_generator: InsightGenerator,
/// Chat continuation service. Hold an Arc so handlers can clone cheaply.
@@ -105,6 +116,8 @@ impl AppState {
ollama: OllamaClient,
openrouter: Option<Arc<OpenRouterClient>>,
openrouter_allowed_models: Vec<String>,
llamacpp: Option<Arc<LlamaCppClient>>,
llamacpp_allowed_models: Vec<String>,
sms_client: SmsApiClient,
insight_generator: InsightGenerator,
insight_chat: Arc<InsightChatService>,
@@ -145,6 +158,8 @@ impl AppState {
ollama,
openrouter,
openrouter_allowed_models,
llamacpp,
llamacpp_allowed_models,
sms_client,
insight_generator,
insight_chat,
@@ -186,6 +201,9 @@ impl Default for AppState {
let openrouter = build_openrouter_from_env();
let openrouter_allowed_models = parse_openrouter_allowed_models();
let llamacpp = build_llamacpp_from_env();
let llamacpp_allowed_models = parse_llamacpp_allowed_models();
let sms_api_url =
env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string());
let sms_api_token = env::var("SMS_API_TOKEN").ok();
@@ -250,6 +268,7 @@ impl Default for AppState {
let insight_generator = InsightGenerator::new(
ollama.clone(),
openrouter.clone(),
llamacpp.clone(),
sms_client.clone(),
apollo_client.clone(),
insight_dao.clone(),
@@ -271,8 +290,6 @@ impl Default for AppState {
Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new()));
let insight_chat = Arc::new(InsightChatService::new(
Arc::new(insight_generator.clone()),
ollama.clone(),
openrouter.clone(),
insight_dao.clone(),
chat_locks,
));
@@ -294,6 +311,8 @@ impl Default for AppState {
ollama,
openrouter,
openrouter_allowed_models,
llamacpp,
llamacpp_allowed_models,
sms_client,
insight_generator,
insight_chat,
@@ -335,6 +354,37 @@ fn parse_openrouter_allowed_models() -> Vec<String> {
.collect()
}
/// Build a `LlamaCppClient` from environment variables. Returns `None` when
/// `LLAMA_SWAP_URL` is unset. The client is constructed unconditionally
/// when the URL is set (so it's available even under `LLM_BACKEND=ollama`
/// for ad-hoc tooling), but the agentic / chat paths only route through it
/// when `LLM_BACKEND=llamacpp`. Slot ids default to the names the bundled
/// `llama-swap/config.yaml` uses — `chat` / `vision` / `embed`.
fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
let base_url = env::var("LLAMA_SWAP_URL").ok()?;
let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok();
let mut client = LlamaCppClient::new(Some(base_url), primary_model);
if let Ok(model) = env::var("LLAMA_SWAP_EMBEDDING_MODEL") {
client.set_embedding_model(model);
}
if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") {
client.set_vision_model(model);
}
Some(Arc::new(client))
}
/// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to
/// populate the model picker when `LLM_BACKEND=llamacpp` — `/insights/models`
/// surfaces these slots with capabilities. Empty when unset.
fn parse_llamacpp_allowed_models() -> Vec<String> {
env::var("LLAMA_SWAP_ALLOWED_MODELS")
.unwrap_or_default()
.split(',')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
.collect()
}
#[cfg(test)]
impl AppState {
/// Creates an AppState instance for testing with temporary directories
@@ -397,6 +447,7 @@ impl AppState {
let insight_generator = InsightGenerator::new(
ollama.clone(),
None,
None,
sms_client.clone(),
apollo_client.clone(),
insight_dao.clone(),
@@ -416,8 +467,6 @@ impl AppState {
Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new()));
let insight_chat = Arc::new(InsightChatService::new(
Arc::new(insight_generator.clone()),
ollama.clone(),
None,
insight_dao.clone(),
chat_locks,
));
@@ -445,6 +494,8 @@ impl AppState {
ollama,
None,
Vec::new(),
None,
Vec::new(),
sms_client,
insight_generator,
insight_chat,

View File

@@ -36,12 +36,19 @@ use crate::video::actors::{generate_image_thumbnail_ffmpeg, generate_video_thumb
/// `size=full` and the handler streams the original bytes.
pub const LARGE_PREVIEW_MAX_DIM: u32 = 2048;
/// JPEG quality for the large preview tier. 85 is the conventional
/// "indistinguishable from source at viewing size" point — well above the
/// `image` crate's default ~75, but well below quality-90+ territory where
/// file size doubles for no perceptible win.
/// JPEG quality for the large and xlarge preview tiers. 85 is the
/// conventional "indistinguishable from source at viewing size" point —
/// well above the `image` crate's default ~75, but well below quality-90+
/// territory where file size doubles for no perceptible win.
const LARGE_PREVIEW_JPEG_QUALITY: u8 = 85;
/// Maximum long-edge size (px) for the xlarge preview tier. Bridges the
/// gap between `large` (2048px, ~16MB decoded) and the original bytes
/// (potentially 48+ MP / ~192MB decoded). At 4096px the decoded bitmap is
/// ~64MB — enough for 2-3× pinch-zoom on any phone before the viewer
/// needs to stream the true original.
pub const XLARGE_PREVIEW_MAX_DIM: u32 = 4096;
lazy_static! {
pub static ref IMAGE_GAUGE: IntGauge = IntGauge::new(
"imageserver_image_total",
@@ -205,6 +212,86 @@ fn generate_large_preview_ffmpeg(src: &Path, dest: &Path) -> std::io::Result<()>
Ok(())
}
/// Generate the on-demand xlarge-preview tier (≈4096 long edge JPEG).
///
/// Same waterfall as [`generate_large_preview`] but targeting
/// [`XLARGE_PREVIEW_MAX_DIM`]. Sources whose long edge is already below
/// the cap are encoded at native size (no upscale).
pub fn generate_xlarge_preview(src: &Path, dest: &Path) -> std::io::Result<()> {
let orientation = exif::read_orientation(src).unwrap_or(1);
if let Some(preview) = exif::extract_embedded_jpeg_preview(src) {
let img = image::load_from_memory(&preview).map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("decode embedded preview {:?}: {}", src, e),
)
})?;
let img = exif::apply_orientation(img, orientation);
return encode_xlarge_jpeg(img, dest);
}
if file_types::needs_ffmpeg_thumbnail(src) {
return generate_xlarge_preview_ffmpeg(src, dest);
}
let img = image::open(src).map_err(|e| {
std::io::Error::new(std::io::ErrorKind::InvalidData, format!("{:?}: {}", src, e))
})?;
let img = exif::apply_orientation(img, orientation);
encode_xlarge_jpeg(img, dest)
}
fn encode_xlarge_jpeg(img: image::DynamicImage, dest: &Path) -> std::io::Result<()> {
let (w, h) = img.dimensions();
let max_dim = w.max(h);
let scaled = if max_dim > XLARGE_PREVIEW_MAX_DIM {
img.thumbnail(XLARGE_PREVIEW_MAX_DIM, XLARGE_PREVIEW_MAX_DIM)
} else {
img
};
let file = std::fs::File::create(dest)
.map_err(|e| std::io::Error::other(format!("create {:?}: {}", dest, e)))?;
let mut writer = std::io::BufWriter::new(file);
let mut encoder = JpegEncoder::new_with_quality(&mut writer, LARGE_PREVIEW_JPEG_QUALITY);
encoder
.encode_image(&scaled)
.map_err(|e| std::io::Error::other(format!("encode {:?}: {}", dest, e)))?;
Ok(())
}
fn generate_xlarge_preview_ffmpeg(src: &Path, dest: &Path) -> std::io::Result<()> {
let vf = format!(
"scale='if(gt(iw,ih),min(iw,{cap}),-1)':'if(gt(iw,ih),-1,min(ih,{cap}))'",
cap = XLARGE_PREVIEW_MAX_DIM
);
let output = Command::new("ffmpeg")
.arg("-y")
.arg("-i")
.arg(src)
.arg("-vframes")
.arg("1")
.arg("-vf")
.arg(&vf)
.arg("-q:v")
.arg("5")
.arg("-f")
.arg("image2")
.arg("-c:v")
.arg("mjpeg")
.arg(dest)
.output()?;
if !output.status.success() {
return Err(std::io::Error::other(format!(
"ffmpeg failed ({}): {}",
output.status,
String::from_utf8_lossy(&output.stderr).trim()
)));
}
Ok(())
}
pub fn create_thumbnails(libs: &[libraries::Library], excluded_dirs: &[String]) {
let tracer = global_tracer();
let span = tracer.start("creating thumbnails");