2 changed files with 26 additions and 5 deletions
@@ -80,6 +80,16 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
 # LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
 # LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180

+# ── Unified search translation model (optional) ─────────────────────────
+# /photos/search/unified runs one small LLM call to translate a natural-
+# language query into structured filters + a semantic term, then CLIP-ranks.
+# That step needs an LLM AND CLIP available at once. On a tight VRAM budget a
+# large chat model can't co-reside with CLIP, so pin a small, fast model here
+# (it can stay loaded alongside CLIP and the chat model). Precedence:
+# UNIFIED_SEARCH_MODEL > the client's selected model > the configured default.
+# Uses the configured backend (LLM_BACKEND); local only — no hybrid.
+# UNIFIED_SEARCH_MODEL=qwen3-0.6b
+
 # ── Text-to-speech (optional, requires LLAMA_SWAP_URL) ───────────────────
 # TTS routes through the same llama-swap proxy (a Chatterbox model id), so it
 # only needs LLAMA_SWAP_URL — it does NOT require LLM_BACKEND=llamacpp.
@@ -172,12 +172,22 @@ pub async fn unified_search<TagD: TagDao>(
    };

    // Respect env/config for the LLM backend (LLM_BACKEND → ollama or
-    // llama-swap); local only, no hybrid, per the feature's design. The
-    // client-supplied model (the user's current selection) routes translation
-    // to an already-loaded model when possible; otherwise resolve_backend
-    // falls back to the configured default.
+    // llama-swap); local only, no hybrid, per the feature's design.
+    //
+    // Translation-model precedence:
+    //   1. UNIFIED_SEARCH_MODEL env — pin a small, fast model that can stay
+    //      co-resident with CLIP (and the chat model) so translation never
+    //      evicts them. This is the recommended setup on a tight VRAM budget.
+    //   2. the client-selected model — routes translation to whatever the user
+    //      already has loaded (no swap) when no dedicated model is pinned.
+    //   3. None → resolve_backend uses the configured default local model.
+    let translation_model = std::env::var("UNIFIED_SEARCH_MODEL")
+        .ok()
+        .filter(|m| !m.trim().is_empty())
+        .or_else(|| query.model.clone())
+        .filter(|m| !m.trim().is_empty());
    let overrides = SamplingOverrides {
-        model: query.model.clone().filter(|m| !m.is_empty()),
+        model: translation_model,
        num_ctx: None,
        temperature: None,
        top_p: None,
@@ -197,6 +207,7 @@ pub async fn unified_search<TagD: TagDao>(
            });
        }
    };
+    log::info!("unified_search: translating with model={}", backend.model());

    let today = chrono::Utc::now().date_naive();
    let sq = match translate_nl_query(backend.chat(), &nl, &tag_vocab, today).await {