diff --git a/.env.example b/.env.example index 2e431bc..64c31d3 100644 --- a/.env.example +++ b/.env.example @@ -80,6 +80,16 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed # LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 +# ── Unified search translation model (optional) ───────────────────────── +# /photos/search/unified runs one small LLM call to translate a natural- +# language query into structured filters + a semantic term, then CLIP-ranks. +# That step needs an LLM AND CLIP available at once. On a tight VRAM budget a +# large chat model can't co-reside with CLIP, so pin a small, fast model here +# (it can stay loaded alongside CLIP and the chat model). Precedence: +# UNIFIED_SEARCH_MODEL > the client's selected model > the configured default. +# Uses the configured backend (LLM_BACKEND); local only — no hybrid. +# UNIFIED_SEARCH_MODEL=qwen3-0.6b + # ── Text-to-speech (optional, requires LLAMA_SWAP_URL) ─────────────────── # TTS routes through the same llama-swap proxy (a Chatterbox model id), so it # only needs LLAMA_SWAP_URL — it does NOT require LLM_BACKEND=llamacpp. diff --git a/src/unified_search.rs b/src/unified_search.rs index d80feec..bb6344c 100644 --- a/src/unified_search.rs +++ b/src/unified_search.rs @@ -172,12 +172,22 @@ pub async fn unified_search( }; // Respect env/config for the LLM backend (LLM_BACKEND → ollama or - // llama-swap); local only, no hybrid, per the feature's design. The - // client-supplied model (the user's current selection) routes translation - // to an already-loaded model when possible; otherwise resolve_backend - // falls back to the configured default. + // llama-swap); local only, no hybrid, per the feature's design. + // + // Translation-model precedence: + // 1. UNIFIED_SEARCH_MODEL env — pin a small, fast model that can stay + // co-resident with CLIP (and the chat model) so translation never + // evicts them. This is the recommended setup on a tight VRAM budget. + // 2. 
the client-selected model — routes translation to whatever the user + // already has loaded (no swap) when no dedicated model is pinned. + // 3. None → resolve_backend uses the configured default local model. + let translation_model = std::env::var("UNIFIED_SEARCH_MODEL") + .ok() + .filter(|m| !m.trim().is_empty()) + .or_else(|| query.model.clone()) + .filter(|m| !m.trim().is_empty()); let overrides = SamplingOverrides { - model: query.model.clone().filter(|m| !m.is_empty()), + model: translation_model, num_ctx: None, temperature: None, top_p: None, @@ -197,6 +207,7 @@ pub async fn unified_search( }); } }; + log::info!("unified_search: translating with model={}", backend.model()); let today = chrono::Utc::now().date_naive(); let sq = match translate_nl_query(backend.chat(), &nl, &tag_vocab, today).await {