Unified search: UNIFIED_SEARCH_MODEL env override for the translation step
Pin the NL->structured translation to a small, fast model that can stay co-resident with CLIP (and the chat model) so it never evicts them on a tight VRAM budget. Precedence: UNIFIED_SEARCH_MODEL env > client-selected model > configured default. Logs the effective model (backend.model()) so model A/B tests are visible. Documented in .env.example.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -80,6 +80,16 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
|
|||||||
# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
|
# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
|
||||||
# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
|
# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
|
||||||
|
|
||||||
|
# ── Unified search translation model (optional) ─────────────────────────
|
||||||
|
# /photos/search/unified runs one small LLM call to translate a natural-
|
||||||
|
# language query into structured filters + a semantic term, then CLIP-ranks.
|
||||||
|
# That step needs an LLM AND CLIP available at once. On a tight VRAM budget a
|
||||||
|
# large chat model can't co-reside with CLIP, so pin a small, fast model here
|
||||||
|
# (it can stay loaded alongside CLIP and the chat model). Precedence:
|
||||||
|
# UNIFIED_SEARCH_MODEL > the client's selected model > the configured default.
|
||||||
|
# Translation runs on the configured backend (LLM_BACKEND: ollama or llama-swap); local only — no hybrid.
|
||||||
|
# UNIFIED_SEARCH_MODEL=qwen3-0.6b
|
||||||
|
|
||||||
# ── Text-to-speech (optional, requires LLAMA_SWAP_URL) ───────────────────
|
# ── Text-to-speech (optional, requires LLAMA_SWAP_URL) ───────────────────
|
||||||
# TTS routes through the same llama-swap proxy (a Chatterbox model id), so it
|
# TTS routes through the same llama-swap proxy (a Chatterbox model id), so it
|
||||||
# only needs LLAMA_SWAP_URL — it does NOT require LLM_BACKEND=llamacpp.
|
# only needs LLAMA_SWAP_URL — it does NOT require LLM_BACKEND=llamacpp.
|
||||||
|
|||||||
+16
-5
@@ -172,12 +172,22 @@ pub async fn unified_search<TagD: TagDao>(
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Respect env/config for the LLM backend (LLM_BACKEND → ollama or
|
// Respect env/config for the LLM backend (LLM_BACKEND → ollama or
|
||||||
// llama-swap); local only, no hybrid, per the feature's design. The
|
// llama-swap); local only, no hybrid, per the feature's design.
|
||||||
// client-supplied model (the user's current selection) routes translation
|
//
|
||||||
// to an already-loaded model when possible; otherwise resolve_backend
|
// Translation-model precedence:
|
||||||
// falls back to the configured default.
|
// 1. UNIFIED_SEARCH_MODEL env — pin a small, fast model that can stay
|
||||||
|
// co-resident with CLIP (and the chat model) so translation never
|
||||||
|
// evicts them. This is the recommended setup on a tight VRAM budget.
|
||||||
|
// 2. the client-selected model — routes translation to whatever the user
|
||||||
|
// already has loaded (no swap) when no dedicated model is pinned.
|
||||||
|
// 3. None → resolve_backend uses the configured default local model.
|
||||||
|
let translation_model = std::env::var("UNIFIED_SEARCH_MODEL")
|
||||||
|
.ok()
|
||||||
|
.filter(|m| !m.trim().is_empty())
|
||||||
|
.or_else(|| query.model.clone())
|
||||||
|
.filter(|m| !m.trim().is_empty());
|
||||||
let overrides = SamplingOverrides {
|
let overrides = SamplingOverrides {
|
||||||
model: query.model.clone().filter(|m| !m.is_empty()),
|
model: translation_model,
|
||||||
num_ctx: None,
|
num_ctx: None,
|
||||||
temperature: None,
|
temperature: None,
|
||||||
top_p: None,
|
top_p: None,
|
||||||
@@ -197,6 +207,7 @@ pub async fn unified_search<TagD: TagDao>(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
log::info!("unified_search: translating with model={}", backend.model());
|
||||||
|
|
||||||
let today = chrono::Utc::now().date_naive();
|
let today = chrono::Utc::now().date_naive();
|
||||||
let sq = match translate_nl_query(backend.chat(), &nl, &tag_vocab, today).await {
|
let sq = match translate_nl_query(backend.chat(), &nl, &tag_vocab, today).await {
|
||||||
|
|||||||
Reference in New Issue
Block a user