fix: audit fixes for async insight jobs + persist generation params

- Fix query param mismatch: rename GenerationStatusQuery.file_path to
  path so the client's app-resume buildQuery({ path: ... }) resolves
  correctly instead of always getting 400
- Remove dead _lib_id bindings from both generate handlers
- Return 202 Accepted instead of 200 from generate endpoints
- Restore OpenTelemetry span instrumentation on generate handlers
- Remove stale UNIQUE constraint from initial migration (incompatible
  with plain-INSERT DAO)
- Add tests for the status guard (complete_job/fail_job are no-ops when
  the job is already cancelled) and for cancel_job by id
- Persist generation params (num_ctx, temperature, top_p, top_k, min_p,
  system_prompt, persona_id) on the photo_insights table for auditing

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-05-27 13:02:15 -04:00
parent b87eb4e690
commit 2818936739
14 changed files with 786 additions and 194 deletions
+340 -90
View File
@@ -73,13 +73,13 @@ pub struct GenerationStatusQuery {
/// If provided with `library`, look up the latest running job for this
/// file. Used when the client doesn't have a persisted job_id.
#[serde(default)]
pub file_path: Option<String>,
pub path: Option<String>,
#[serde(default)]
pub library: Option<String>,
}
/// GET /insights/generation/status - Check status of a generation job.
/// Accepts either `?job_id=<id>` or `?file_path=<path>&library=<name>`.
/// Accepts either `?job_id=<id>` or `?path=<path>&library=<name>`.
#[get("/insights/generation/status")]
pub async fn generation_status_handler(
_claims: Claims,
@@ -118,7 +118,7 @@ pub async fn generation_status_handler(
}
}
if let Some(ref fp) = query.file_path {
if let Some(ref fp) = query.path {
let library = libraries::resolve_library_param(&app_state, query.library.as_deref())
.ok()
.flatten()
@@ -156,7 +156,115 @@ pub async fn generation_status_handler(
}
HttpResponse::BadRequest().json(serde_json::json!({
"error": "Provide either job_id or file_path query parameter"
"error": "Provide either job_id or path query parameter"
}))
}
/// Request body for `POST /insights/generation/cancel`.
///
/// Every field is optional. The handler checks `job_id` first and only
/// consults `file_path` when no `job_id` was supplied; if neither is
/// present the request is rejected with 400.
#[derive(Debug, Deserialize)]
pub struct CancelGenerationRequest {
/// If provided, cancel the specific job by id.
#[serde(default)]
pub job_id: Option<i32>,
/// If provided (and `job_id` is absent), cancel all running jobs for this
/// file. The path is passed through `normalize_path` before lookup.
#[serde(default)]
pub file_path: Option<String>,
/// Library selector handed to `libraries::resolve_library_param` to scope
/// the `file_path` lookup. When absent or not resolvable, the handler
/// falls back to the primary library.
#[serde(default)]
pub library: Option<String>,
}
/// POST /insights/generation/cancel - Cancel a running generation job.
/// Accepts either `job_id` or `file_path` + optional `library` in the body.
#[post("/insights/generation/cancel")]
pub async fn cancel_generation_handler(
_claims: Claims,
request: web::Json<CancelGenerationRequest>,
app_state: web::Data<AppState>,
) -> impl Responder {
let ctx = opentelemetry::Context::new();
if let Some(jid) = request.job_id {
let mut dao = app_state
.insight_job_dao
.lock()
.expect("Unable to lock InsightJobDao");
match dao.cancel_job(&ctx, jid) {
Ok(true) => {
let mut handles = app_state
.insight_job_handles
.lock()
.expect("Unable to lock InsightJobHandles");
if let Some(handle) = handles.remove(&jid) {
handle.abort();
}
return HttpResponse::Ok().json(serde_json::json!({
"success": true,
"message": format!("Job {} cancelled", jid)
}));
}
Ok(false) => {
return HttpResponse::Ok().json(serde_json::json!({
"success": true,
"message": format!("Job {} was not running", jid)
}));
}
Err(e) => {
log::error!("Failed to cancel job {}: {:?}", jid, e);
return HttpResponse::InternalServerError().json(serde_json::json!({
"error": "Failed to cancel job"
}));
}
}
}
if let Some(ref fp) = request.file_path {
let library = libraries::resolve_library_param(&app_state, request.library.as_deref())
.ok()
.flatten()
.unwrap_or_else(|| app_state.primary_library());
let normalized = normalize_path(fp);
// Get active job ids first, then cancel in DB, then abort tasks
let active_ids: Vec<i32> = {
let mut dao = app_state
.insight_job_dao
.lock()
.expect("Unable to lock InsightJobDao");
let ids = dao
.get_active_job(&ctx, library.id, &normalized)
.ok()
.flatten()
.map(|j| vec![j.id])
.unwrap_or_default();
let _ = dao.cancel_active_jobs(&ctx, library.id, &normalized);
ids
};
if active_ids.is_empty() {
return HttpResponse::Ok().json(serde_json::json!({
"success": true,
"message": "No running generation job for this file"
}));
}
for jid in &active_ids {
if let Some(handle) = app_state
.insight_job_handles
.lock()
.expect("Unable to lock InsightJobHandles")
.remove(jid)
{
handle.abort();
}
}
return HttpResponse::Ok().json(serde_json::json!({
"success": true,
"message": format!("Cancelled {} running job(s) for {}", active_ids.len(), normalized)
}));
}
HttpResponse::BadRequest().json(serde_json::json!({
"error": "Provide either job_id or file_path in the request body"
}))
}
@@ -208,6 +316,20 @@ pub struct PhotoInsightResponse {
/// True when the insight was generated agentically and a chat
/// continuation can be started against it. Drives the mobile chat button.
pub has_training_messages: bool,
#[serde(skip_serializing_if = "Option::is_none")]
pub num_ctx: Option<i32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub temperature: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub top_p: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub top_k: Option<i32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub min_p: Option<f32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub system_prompt: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub persona_id: Option<String>,
}
#[derive(Debug, Serialize)]
@@ -227,33 +349,55 @@ pub struct ServerModels {
/// POST /insights/generate - Generate insight for a specific photo (async)
#[post("/insights/generate")]
pub async fn generate_insight_handler(
_http_request: HttpRequest,
http_request: HttpRequest,
_claims: Claims,
request: web::Json<GeneratePhotoInsightRequest>,
app_state: web::Data<AppState>,
) -> impl Responder {
let parent_context = extract_context_from_request(&http_request);
let tracer = global_tracer();
let mut span = tracer.start_with_context("http.insights.generate", &parent_context);
let normalized_path = normalize_path(&request.file_path);
let library = app_state.primary_library();
let gen_type = InsightGenerationType::Standard;
span.set_attribute(KeyValue::new("file_path", normalized_path.clone()));
if let Some(ref model) = request.model {
span.set_attribute(KeyValue::new("model", model.clone()));
}
log::info!(
"Manual insight generation triggered for photo: {} with model: {:?}",
normalized_path,
request.model
);
// Cancel any running job for this file, then create a fresh one
{
// Look up and abort any running job for this file, then cancel in DB
let old_job_ids: Vec<i32> = {
let mut dao = app_state
.insight_job_dao
.lock()
.expect("Unable to lock InsightJobDao");
let _ = dao.cancel_active_job(
&opentelemetry::Context::new(),
library.id,
&normalized_path,
gen_type,
);
let ctx = opentelemetry::Context::new();
let ids = dao
.get_active_job(&ctx, library.id, &normalized_path)
.ok()
.flatten()
.map(|j| vec![j.id])
.unwrap_or_default();
let _ = dao.cancel_active_jobs(&ctx, library.id, &normalized_path);
ids
};
for jid in &old_job_ids {
if let Some(handle) = app_state
.insight_job_handles
.lock()
.expect("Unable to lock InsightJobHandles")
.remove(jid)
{
handle.abort();
}
}
let job_id = {
@@ -261,7 +405,7 @@ pub async fn generate_insight_handler(
.insight_job_dao
.lock()
.expect("Unable to lock InsightJobDao");
match dao.create_or_get_active_job(
match dao.create_job(
&opentelemetry::Context::new(),
library.id,
&normalized_path,
@@ -270,6 +414,7 @@ pub async fn generate_insight_handler(
Ok(id) => id,
Err(e) => {
log::error!("Failed to create generation job: {:?}", e);
span.set_status(Status::error("Failed to create generation job"));
return HttpResponse::InternalServerError().json(serde_json::json!({
"error": "Failed to create generation job"
}));
@@ -280,36 +425,40 @@ pub async fn generate_insight_handler(
// Spawn background task with timeout
let generator = app_state.insight_generator.clone();
let job_dao = app_state.insight_job_dao.clone();
let lib_id = library.id;
let job_handles = app_state.insight_job_handles.clone();
let path = normalized_path.clone();
tokio::spawn(async move {
let handle = tokio::spawn(async move {
let timeout_secs: u64 = std::env::var("INSIGHT_GENERATION_TIMEOUT_SECS")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(120);
let result = tokio::time::timeout(
std::time::Duration::from_secs(timeout_secs),
generator.generate_insight_for_photo_with_config(
&path,
request.model.clone(),
request.system_prompt.clone(),
request.num_ctx,
request.temperature,
request.top_p,
request.top_k,
request.min_p,
),
)
let path_for_task = path.clone();
let generator_for_task = generator.clone();
let result = tokio::task::spawn(async move {
tokio::time::timeout(
std::time::Duration::from_secs(timeout_secs),
generator_for_task.generate_insight_for_photo_with_config(
&path_for_task,
request.model.clone(),
request.system_prompt.clone(),
request.num_ctx,
request.temperature,
request.top_p,
request.top_k,
request.min_p,
),
)
.await
})
.await;
let ctx = opentelemetry::Context::new();
let mut dao = job_dao.lock().expect("Unable to lock InsightJobDao");
match result {
Ok(Ok(())) => {
// Look up the stored insight id to record on the job
Ok(Ok(Ok(()))) => {
let mut insight_dao = generator
.insight_dao()
.lock()
@@ -320,27 +469,60 @@ pub async fn generate_insight_handler(
.flatten()
.map(|i| i.id);
if let Some(id) = insight_id {
let _ = dao.complete_job(&ctx, job_id, id);
if let Err(e) = dao.complete_job(&ctx, job_id, id) {
log::error!("Failed to mark job {} as completed: {:?}", job_id, e);
}
} else {
let _ = dao.fail_job(&ctx, job_id, "generation returned no insight");
if let Err(e) = dao.fail_job(&ctx, job_id, "generation returned no insight") {
log::error!("Failed to mark job {} as failed: {:?}", job_id, e);
}
}
}
Ok(Err(e)) => {
Ok(Ok(Err(e))) => {
log::error!("Insight generation failed for {}: {:?}", path, e);
let _ = dao.fail_job(&ctx, job_id, &format!("{:?}", e));
if let Err(err) = dao.fail_job(&ctx, job_id, &format!("{:?}", e)) {
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
}
}
Err(_) => {
Ok(Err(_)) => {
log::error!(
"Insight generation timed out for {} after {}s",
path,
timeout_secs
);
let _ = dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs));
if let Err(err) =
dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs))
{
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
}
}
Err(_) => {
log::error!("Insight generation task panicked for {}", path);
if let Err(err) = dao.fail_job(&ctx, job_id, "generation task panicked") {
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
}
}
}
// Remove handle from map on completion
let mut handles = job_handles
.lock()
.expect("Unable to lock InsightJobHandles");
handles.remove(&job_id);
});
HttpResponse::Ok().json(JobIdResponse { job_id })
// Store abort handle
{
let mut handles = app_state
.insight_job_handles
.lock()
.expect("Unable to lock InsightJobHandles");
handles.insert(job_id, handle.abort_handle());
}
span.set_attribute(KeyValue::new("job_id", job_id as i64));
span.set_status(Status::Ok);
HttpResponse::Accepted().json(JobIdResponse { job_id })
}
/// GET /insights?path=/path/to/photo.jpg - Fetch insight for specific photo
@@ -385,6 +567,13 @@ pub async fn get_insight_handler(
approved: insight.approved,
has_training_messages: insight.training_messages.is_some(),
backend: insight.backend,
num_ctx: insight.num_ctx,
temperature: insight.temperature,
top_p: insight.top_p,
top_k: insight.top_k,
min_p: insight.min_p,
system_prompt: insight.system_prompt,
persona_id: insight.persona_id,
};
HttpResponse::Ok().json(response)
}
@@ -454,6 +643,13 @@ pub async fn get_all_insights_handler(
approved: insight.approved,
has_training_messages: insight.training_messages.is_some(),
backend: insight.backend,
num_ctx: insight.num_ctx,
temperature: insight.temperature,
top_p: insight.top_p,
top_k: insight.top_k,
min_p: insight.min_p,
system_prompt: insight.system_prompt,
persona_id: insight.persona_id,
})
.collect();
@@ -471,33 +667,58 @@ pub async fn get_all_insights_handler(
/// POST /insights/generate/agentic - Generate insight using agentic tool-calling loop (async)
#[post("/insights/generate/agentic")]
pub async fn generate_agentic_insight_handler(
_http_request: HttpRequest,
http_request: HttpRequest,
claims: Claims,
request: web::Json<GeneratePhotoInsightRequest>,
app_state: web::Data<AppState>,
) -> impl Responder {
let parent_context = extract_context_from_request(&http_request);
let tracer = global_tracer();
let mut span = tracer.start_with_context("http.insights.generate_agentic", &parent_context);
let normalized_path = normalize_path(&request.file_path);
let library = app_state.primary_library();
let gen_type = InsightGenerationType::Agentic;
span.set_attribute(KeyValue::new("file_path", normalized_path.clone()));
if let Some(ref model) = request.model {
span.set_attribute(KeyValue::new("model", model.clone()));
}
if let Some(ref backend) = request.backend {
span.set_attribute(KeyValue::new("backend", backend.clone()));
}
log::info!(
"Agentic insight generation triggered for photo: {} with model: {:?}",
normalized_path,
request.model
);
// Cancel any running job for this file, then create a fresh one
{
// Look up and abort any running job for this file, then cancel in DB
let old_job_ids: Vec<i32> = {
let mut dao = app_state
.insight_job_dao
.lock()
.expect("Unable to lock InsightJobDao");
let _ = dao.cancel_active_job(
&opentelemetry::Context::new(),
library.id,
&normalized_path,
gen_type,
);
let ctx = opentelemetry::Context::new();
let ids = dao
.get_active_job(&ctx, library.id, &normalized_path)
.ok()
.flatten()
.map(|j| vec![j.id])
.unwrap_or_default();
let _ = dao.cancel_active_jobs(&ctx, library.id, &normalized_path);
ids
};
for jid in &old_job_ids {
if let Some(handle) = app_state
.insight_job_handles
.lock()
.expect("Unable to lock InsightJobHandles")
.remove(jid)
{
handle.abort();
}
}
let job_id = {
@@ -505,7 +726,7 @@ pub async fn generate_agentic_insight_handler(
.insight_job_dao
.lock()
.expect("Unable to lock InsightJobDao");
match dao.create_or_get_active_job(
match dao.create_job(
&opentelemetry::Context::new(),
library.id,
&normalized_path,
@@ -514,6 +735,7 @@ pub async fn generate_agentic_insight_handler(
Ok(id) => id,
Err(e) => {
log::error!("Failed to create agentic generation job: {:?}", e);
span.set_status(Status::error("Failed to create generation job"));
return HttpResponse::InternalServerError().json(serde_json::json!({
"error": "Failed to create generation job"
}));
@@ -573,73 +795,101 @@ pub async fn generate_agentic_insight_handler(
// Spawn background task with timeout
let generator = app_state.insight_generator.clone();
let job_dao = app_state.insight_job_dao.clone();
let lib_id = library.id;
let job_handles = app_state.insight_job_handles.clone();
let path = normalized_path.clone();
tokio::spawn(async move {
let handle = tokio::spawn(async move {
let timeout_secs: u64 = std::env::var("INSIGHT_GENERATION_TIMEOUT_SECS")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(180);
let result = tokio::time::timeout(
std::time::Duration::from_secs(timeout_secs),
generator.generate_agentic_insight_for_photo(
&path,
request.model.clone(),
request.system_prompt.clone(),
request.num_ctx,
request.temperature,
request.top_p,
request.top_k,
request.min_p,
max_iterations,
request.backend.clone(),
fewshot_examples,
fewshot_ids,
user_id,
persona_id,
),
)
let path_for_task = path.clone();
let generator_for_task = generator.clone();
let result = tokio::task::spawn(async move {
tokio::time::timeout(
std::time::Duration::from_secs(timeout_secs),
generator_for_task.generate_agentic_insight_for_photo(
&path_for_task,
request.model.clone(),
request.system_prompt.clone(),
request.num_ctx,
request.temperature,
request.top_p,
request.top_k,
request.min_p,
max_iterations,
request.backend.clone(),
fewshot_examples,
fewshot_ids,
user_id,
persona_id,
),
)
.await
})
.await;
let ctx = opentelemetry::Context::new();
let mut dao = job_dao.lock().expect("Unable to lock InsightJobDao");
match result {
Ok(Ok(_)) => {
// Fetch the stored insight id to record on the job
let mut insight_dao = generator
.insight_dao()
.lock()
.expect("Unable to lock InsightDao");
let insight_id = insight_dao
.get_insight(&ctx, &path)
.ok()
.flatten()
.map(|i| i.id);
if let Some(id) = insight_id {
let _ = dao.complete_job(&ctx, job_id, id);
} else {
let _ = dao.fail_job(&ctx, job_id, "generation returned no insight");
Ok(Ok(Ok((Some(insight_id), _)))) => {
if let Err(e) = dao.complete_job(&ctx, job_id, insight_id) {
log::error!("Failed to mark job {} as completed: {:?}", job_id, e);
}
}
Ok(Err(e)) => {
log::error!("Agentic insight generation failed for {}: {:?}", path, e);
let _ = dao.fail_job(&ctx, job_id, &format!("{:?}", e));
Ok(Ok(Ok((None, _)))) => {
if let Err(e) = dao.fail_job(&ctx, job_id, "agentic generation returned no insight")
{
log::error!("Failed to mark job {} as failed: {:?}", job_id, e);
}
}
Err(_) => {
Ok(Ok(Err(e))) => {
log::error!("Agentic insight generation failed for {}: {:?}", path, e);
if let Err(err) = dao.fail_job(&ctx, job_id, &format!("{:?}", e)) {
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
}
}
Ok(Err(_)) => {
log::error!(
"Agentic insight generation timed out for {} after {}s",
path,
timeout_secs
);
let _ = dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs));
if let Err(err) =
dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs))
{
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
}
}
Err(_) => {
log::error!("Agentic insight generation task panicked for {}", path);
if let Err(err) = dao.fail_job(&ctx, job_id, "generation task panicked") {
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
}
}
}
// Remove handle from map on completion
let mut handles = job_handles
.lock()
.expect("Unable to lock InsightJobHandles");
handles.remove(&job_id);
});
HttpResponse::Ok().json(JobIdResponse { job_id })
// Store abort handle
{
let mut handles = app_state
.insight_job_handles
.lock()
.expect("Unable to lock InsightJobHandles");
handles.insert(job_id, handle.abort_handle());
}
span.set_attribute(KeyValue::new("job_id", job_id as i64));
span.set_status(Status::Ok);
HttpResponse::Accepted().json(JobIdResponse { job_id })
}
/// GET /insights/models - Local-backend models with capabilities. Returns