fix: audit fixes for async insight jobs + persist generation params

- Fix query param mismatch: rename GenerationStatusQuery.file_path to path so the client's app-resume buildQuery({ path: ... }) resolves correctly instead of always getting 400
- Remove dead _lib_id bindings from both generate handlers
- Return 202 Accepted instead of 200 from generate endpoints
- Restore OpenTelemetry span instrumentation on generate handlers
- Remove stale UNIQUE constraint from initial migration (incompatible with plain-INSERT DAO)
- Add tests for the status guard (complete_job/fail_job are no-ops when the job is already cancelled) and for cancel_job by id
- Persist generation params (num_ctx, temperature, top_p, top_k, min_p, system_prompt, persona_id) on the photo_insights table for auditing

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 13:02:15 -04:00
parent b87eb4e690
commit 2818936739
14 changed files with 786 additions and 194 deletions
@@ -73,13 +73,13 @@ pub struct GenerationStatusQuery {
    /// If provided with `library`, look up the latest running job for this
    /// file. Used when the client doesn't have a persisted job_id.
    #[serde(default)]
-    pub file_path: Option<String>,
+    pub path: Option<String>,
    #[serde(default)]
    pub library: Option<String>,
 }

 /// GET /insights/generation/status - Check status of a generation job.
-/// Accepts either `?job_id=<id>` or `?file_path=<path>&library=<name>`.
+/// Accepts either `?job_id=<id>` or `?path=<path>&library=<name>`.
 #[get("/insights/generation/status")]
 pub async fn generation_status_handler(
    _claims: Claims,
@@ -118,7 +118,7 @@ pub async fn generation_status_handler(
        }
    }

-    if let Some(ref fp) = query.file_path {
+    if let Some(ref fp) = query.path {
        let library = libraries::resolve_library_param(&app_state, query.library.as_deref())
            .ok()
            .flatten()
@@ -156,7 +156,115 @@ pub async fn generation_status_handler(
    }

    HttpResponse::BadRequest().json(serde_json::json!({
-        "error": "Provide either job_id or file_path query parameter"
+        "error": "Provide either job_id or path query parameter"
+    }))
+}
+
+#[derive(Debug, Deserialize)]
+pub struct CancelGenerationRequest {
+    /// Id of the job to cancel; the cancel handler checks this first and returns without consulting `file_path`.
+    #[serde(default)]
+    pub job_id: Option<i32>,
+    /// File whose running job(s) to cancel; `library` (next field) is optional, falling back to the primary library if it does not resolve.
+    #[serde(default)]
+    pub file_path: Option<String>,
+    #[serde(default)]
+    pub library: Option<String>,
+}
+
+/// POST /insights/generation/cancel - Cancel a running generation job.
+/// Accepts either `job_id` or `file_path` + optional `library` in the body.
+///
+/// `job_id` wins when both are present. Responses:
+/// - 200 with `success: true` when a job was cancelled or nothing was running,
+/// - 400 when neither `job_id` nor `file_path` is supplied,
+/// - 500 when the job DAO reports an error. No task is aborted in that case,
+///   so a task is never killed while its row is still marked as running.
+///
+/// # Panics
+/// Panics if the `InsightJobDao` or `InsightJobHandles` mutex is poisoned.
+///
+/// NOTE(review): the `file_path` branch only learns one job id from
+/// `get_active_job`, while `cancel_active_jobs` may update more than one row.
+/// Confirm whether several jobs can be active for one file; if so, their task
+/// handles are not aborted here.
+/// NOTE(review): the generate handlers run the actual generation inside a
+/// nested `tokio::task::spawn`; aborting the stored outer handle may leave
+/// that inner task running. Confirm against the Tokio task docs.
+#[post("/insights/generation/cancel")]
+pub async fn cancel_generation_handler(
+    _claims: Claims,
+    request: web::Json<CancelGenerationRequest>,
+    app_state: web::Data<AppState>,
+) -> impl Responder {
+    let ctx = opentelemetry::Context::new();
+
+    if let Some(jid) = request.job_id {
+        let mut dao = app_state
+            .insight_job_dao
+            .lock()
+            .expect("Unable to lock InsightJobDao");
+        match dao.cancel_job(&ctx, jid) {
+            Ok(true) => {
+                // The row was flipped to cancelled; now stop the task itself.
+                let mut handles = app_state
+                    .insight_job_handles
+                    .lock()
+                    .expect("Unable to lock InsightJobHandles");
+                if let Some(handle) = handles.remove(&jid) {
+                    handle.abort();
+                }
+                return HttpResponse::Ok().json(serde_json::json!({
+                    "success": true,
+                    "message": format!("Job {} cancelled", jid)
+                }));
+            }
+            Ok(false) => {
+                return HttpResponse::Ok().json(serde_json::json!({
+                    "success": true,
+                    "message": format!("Job {} was not running", jid)
+                }));
+            }
+            Err(e) => {
+                log::error!("Failed to cancel job {}: {:?}", jid, e);
+                return HttpResponse::InternalServerError().json(serde_json::json!({
+                    "error": "Failed to cancel job"
+                }));
+            }
+        }
+    }
+
+    if let Some(ref fp) = request.file_path {
+        let library = libraries::resolve_library_param(&app_state, request.library.as_deref())
+            .ok()
+            .flatten()
+            .unwrap_or_else(|| app_state.primary_library());
+        let normalized = normalize_path(fp);
+
+        // Get active job ids first, then cancel in DB, then abort tasks.
+        // DAO failures are reported as 500 (matching the `job_id` branch)
+        // instead of being swallowed and answered with `success: true`.
+        let active_ids: Vec<i32> = {
+            let mut dao = app_state
+                .insight_job_dao
+                .lock()
+                .expect("Unable to lock InsightJobDao");
+            let ids: Vec<i32> = match dao.get_active_job(&ctx, library.id, &normalized) {
+                Ok(job) => job.map(|j| j.id).into_iter().collect(),
+                Err(e) => {
+                    log::error!("Failed to look up active job for {}: {:?}", normalized, e);
+                    return HttpResponse::InternalServerError().json(serde_json::json!({
+                        "error": "Failed to cancel job"
+                    }));
+                }
+            };
+            // Still issued when no id was found, as before.
+            if let Err(e) = dao.cancel_active_jobs(&ctx, library.id, &normalized) {
+                log::error!("Failed to cancel active jobs for {}: {:?}", normalized, e);
+                return HttpResponse::InternalServerError().json(serde_json::json!({
+                    "error": "Failed to cancel job"
+                }));
+            }
+            ids
+        };
+
+        if active_ids.is_empty() {
+            return HttpResponse::Ok().json(serde_json::json!({
+                "success": true,
+                "message": "No running generation job for this file"
+            }));
+        }
+
+        // The DAO lock is released above; take the handle lock once per job.
+        for jid in &active_ids {
+            if let Some(handle) = app_state
+                .insight_job_handles
+                .lock()
+                .expect("Unable to lock InsightJobHandles")
+                .remove(jid)
+            {
+                handle.abort();
+            }
+        }
+
+        return HttpResponse::Ok().json(serde_json::json!({
+            "success": true,
+            "message": format!("Cancelled {} running job(s) for {}", active_ids.len(), normalized)
+        }));
+    }
+
+    HttpResponse::BadRequest().json(serde_json::json!({
+        "error": "Provide either job_id or file_path in the request body"
     }))
 }

@@ -208,6 +316,20 @@ pub struct PhotoInsightResponse {
    /// True when the insight was generated agentically and a chat
    /// continuation can be started against it. Drives the mobile chat button.
    pub has_training_messages: bool,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub num_ctx: Option<i32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_p: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_k: Option<i32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub min_p: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub system_prompt: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub persona_id: Option<String>,
 }

 #[derive(Debug, Serialize)]
@@ -227,33 +349,55 @@ pub struct ServerModels {
 /// POST /insights/generate - Generate insight for a specific photo (async)
 #[post("/insights/generate")]
 pub async fn generate_insight_handler(
-    _http_request: HttpRequest,
+    http_request: HttpRequest,
    _claims: Claims,
    request: web::Json<GeneratePhotoInsightRequest>,
    app_state: web::Data<AppState>,
 ) -> impl Responder {
+    let parent_context = extract_context_from_request(&http_request);
+    let tracer = global_tracer();
+    let mut span = tracer.start_with_context("http.insights.generate", &parent_context);
+
    let normalized_path = normalize_path(&request.file_path);
    let library = app_state.primary_library();
    let gen_type = InsightGenerationType::Standard;

+    span.set_attribute(KeyValue::new("file_path", normalized_path.clone()));
+    if let Some(ref model) = request.model {
+        span.set_attribute(KeyValue::new("model", model.clone()));
+    }
+
    log::info!(
        "Manual insight generation triggered for photo: {} with model: {:?}",
        normalized_path,
        request.model
    );

-    // Cancel any running job for this file, then create a fresh one
-    {
+    // Look up and abort any running job for this file, then cancel in DB
+    let old_job_ids: Vec<i32> = {
        let mut dao = app_state
            .insight_job_dao
            .lock()
            .expect("Unable to lock InsightJobDao");
-        let _ = dao.cancel_active_job(
-            &opentelemetry::Context::new(),
-            library.id,
-            &normalized_path,
-            gen_type,
-        );
+        let ctx = opentelemetry::Context::new();
+        let ids = dao
+            .get_active_job(&ctx, library.id, &normalized_path)
+            .ok()
+            .flatten()
+            .map(|j| vec![j.id])
+            .unwrap_or_default();
+        let _ = dao.cancel_active_jobs(&ctx, library.id, &normalized_path);
+        ids
+    };
+    for jid in &old_job_ids {
+        if let Some(handle) = app_state
+            .insight_job_handles
+            .lock()
+            .expect("Unable to lock InsightJobHandles")
+            .remove(jid)
+        {
+            handle.abort();
+        }
    }

    let job_id = {
@@ -261,7 +405,7 @@ pub async fn generate_insight_handler(
            .insight_job_dao
            .lock()
            .expect("Unable to lock InsightJobDao");
-        match dao.create_or_get_active_job(
+        match dao.create_job(
            &opentelemetry::Context::new(),
            library.id,
            &normalized_path,
@@ -270,6 +414,7 @@ pub async fn generate_insight_handler(
            Ok(id) => id,
            Err(e) => {
                log::error!("Failed to create generation job: {:?}", e);
+                span.set_status(Status::error("Failed to create generation job"));
                return HttpResponse::InternalServerError().json(serde_json::json!({
                    "error": "Failed to create generation job"
                }));
@@ -280,36 +425,40 @@ pub async fn generate_insight_handler(
    // Spawn background task with timeout
    let generator = app_state.insight_generator.clone();
    let job_dao = app_state.insight_job_dao.clone();
-    let lib_id = library.id;
+    let job_handles = app_state.insight_job_handles.clone();
    let path = normalized_path.clone();

-    tokio::spawn(async move {
+    let handle = tokio::spawn(async move {
        let timeout_secs: u64 = std::env::var("INSIGHT_GENERATION_TIMEOUT_SECS")
            .ok()
            .and_then(|v| v.parse().ok())
            .unwrap_or(120);

-        let result = tokio::time::timeout(
-            std::time::Duration::from_secs(timeout_secs),
-            generator.generate_insight_for_photo_with_config(
-                &path,
-                request.model.clone(),
-                request.system_prompt.clone(),
-                request.num_ctx,
-                request.temperature,
-                request.top_p,
-                request.top_k,
-                request.min_p,
-            ),
-        )
+        let path_for_task = path.clone();
+        let generator_for_task = generator.clone();
+        let result = tokio::task::spawn(async move {
+            tokio::time::timeout(
+                std::time::Duration::from_secs(timeout_secs),
+                generator_for_task.generate_insight_for_photo_with_config(
+                    &path_for_task,
+                    request.model.clone(),
+                    request.system_prompt.clone(),
+                    request.num_ctx,
+                    request.temperature,
+                    request.top_p,
+                    request.top_k,
+                    request.min_p,
+                ),
+            )
+            .await
+        })
        .await;

        let ctx = opentelemetry::Context::new();
        let mut dao = job_dao.lock().expect("Unable to lock InsightJobDao");

        match result {
-            Ok(Ok(())) => {
-                // Look up the stored insight id to record on the job
+            Ok(Ok(Ok(()))) => {
                let mut insight_dao = generator
                    .insight_dao()
                    .lock()
@@ -320,27 +469,60 @@ pub async fn generate_insight_handler(
                    .flatten()
                    .map(|i| i.id);
                if let Some(id) = insight_id {
-                    let _ = dao.complete_job(&ctx, job_id, id);
+                    if let Err(e) = dao.complete_job(&ctx, job_id, id) {
+                        log::error!("Failed to mark job {} as completed: {:?}", job_id, e);
+                    }
                } else {
-                    let _ = dao.fail_job(&ctx, job_id, "generation returned no insight");
+                    if let Err(e) = dao.fail_job(&ctx, job_id, "generation returned no insight") {
+                        log::error!("Failed to mark job {} as failed: {:?}", job_id, e);
+                    }
                }
            }
-            Ok(Err(e)) => {
+            Ok(Ok(Err(e))) => {
                log::error!("Insight generation failed for {}: {:?}", path, e);
-                let _ = dao.fail_job(&ctx, job_id, &format!("{:?}", e));
+                if let Err(err) = dao.fail_job(&ctx, job_id, &format!("{:?}", e)) {
+                    log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
+                }
            }
-            Err(_) => {
+            Ok(Err(_)) => {
                log::error!(
                    "Insight generation timed out for {} after {}s",
                    path,
                    timeout_secs
                );
-                let _ = dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs));
+                if let Err(err) =
+                    dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs))
+                {
+                    log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
+                }
+            }
+            Err(_) => {
+                log::error!("Insight generation task panicked for {}", path);
+                if let Err(err) = dao.fail_job(&ctx, job_id, "generation task panicked") {
+                    log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
+                }
            }
        }
+
+        // Remove handle from map on completion
+        let mut handles = job_handles
+            .lock()
+            .expect("Unable to lock InsightJobHandles");
+        handles.remove(&job_id);
    });

-    HttpResponse::Ok().json(JobIdResponse { job_id })
+    // Store abort handle
+    {
+        let mut handles = app_state
+            .insight_job_handles
+            .lock()
+            .expect("Unable to lock InsightJobHandles");
+        handles.insert(job_id, handle.abort_handle());
+    }
+
+    span.set_attribute(KeyValue::new("job_id", job_id as i64));
+    span.set_status(Status::Ok);
+    HttpResponse::Accepted().json(JobIdResponse { job_id })
 }

 /// GET /insights?path=/path/to/photo.jpg - Fetch insight for specific photo
@@ -385,6 +567,13 @@ pub async fn get_insight_handler(
                approved: insight.approved,
                has_training_messages: insight.training_messages.is_some(),
                backend: insight.backend,
+                num_ctx: insight.num_ctx,
+                temperature: insight.temperature,
+                top_p: insight.top_p,
+                top_k: insight.top_k,
+                min_p: insight.min_p,
+                system_prompt: insight.system_prompt,
+                persona_id: insight.persona_id,
            };
            HttpResponse::Ok().json(response)
        }
@@ -454,6 +643,13 @@ pub async fn get_all_insights_handler(
                    approved: insight.approved,
                    has_training_messages: insight.training_messages.is_some(),
                    backend: insight.backend,
+                    num_ctx: insight.num_ctx,
+                    temperature: insight.temperature,
+                    top_p: insight.top_p,
+                    top_k: insight.top_k,
+                    min_p: insight.min_p,
+                    system_prompt: insight.system_prompt,
+                    persona_id: insight.persona_id,
                })
                .collect();

@@ -471,33 +667,58 @@ pub async fn get_all_insights_handler(
 /// POST /insights/generate/agentic - Generate insight using agentic tool-calling loop (async)
 #[post("/insights/generate/agentic")]
 pub async fn generate_agentic_insight_handler(
-    _http_request: HttpRequest,
+    http_request: HttpRequest,
    claims: Claims,
    request: web::Json<GeneratePhotoInsightRequest>,
    app_state: web::Data<AppState>,
 ) -> impl Responder {
+    let parent_context = extract_context_from_request(&http_request);
+    let tracer = global_tracer();
+    let mut span = tracer.start_with_context("http.insights.generate_agentic", &parent_context);
+
    let normalized_path = normalize_path(&request.file_path);
    let library = app_state.primary_library();
    let gen_type = InsightGenerationType::Agentic;

+    span.set_attribute(KeyValue::new("file_path", normalized_path.clone()));
+    if let Some(ref model) = request.model {
+        span.set_attribute(KeyValue::new("model", model.clone()));
+    }
+    if let Some(ref backend) = request.backend {
+        span.set_attribute(KeyValue::new("backend", backend.clone()));
+    }
+
    log::info!(
        "Agentic insight generation triggered for photo: {} with model: {:?}",
        normalized_path,
        request.model
    );

-    // Cancel any running job for this file, then create a fresh one
-    {
+    // Look up and abort any running job for this file, then cancel in DB
+    let old_job_ids: Vec<i32> = {
        let mut dao = app_state
            .insight_job_dao
            .lock()
            .expect("Unable to lock InsightJobDao");
-        let _ = dao.cancel_active_job(
-            &opentelemetry::Context::new(),
-            library.id,
-            &normalized_path,
-            gen_type,
-        );
+        let ctx = opentelemetry::Context::new();
+        let ids = dao
+            .get_active_job(&ctx, library.id, &normalized_path)
+            .ok()
+            .flatten()
+            .map(|j| vec![j.id])
+            .unwrap_or_default();
+        let _ = dao.cancel_active_jobs(&ctx, library.id, &normalized_path);
+        ids
+    };
+    for jid in &old_job_ids {
+        if let Some(handle) = app_state
+            .insight_job_handles
+            .lock()
+            .expect("Unable to lock InsightJobHandles")
+            .remove(jid)
+        {
+            handle.abort();
+        }
    }

    let job_id = {
@@ -505,7 +726,7 @@ pub async fn generate_agentic_insight_handler(
            .insight_job_dao
            .lock()
            .expect("Unable to lock InsightJobDao");
-        match dao.create_or_get_active_job(
+        match dao.create_job(
            &opentelemetry::Context::new(),
            library.id,
            &normalized_path,
@@ -514,6 +735,7 @@ pub async fn generate_agentic_insight_handler(
            Ok(id) => id,
            Err(e) => {
                log::error!("Failed to create agentic generation job: {:?}", e);
+                span.set_status(Status::error("Failed to create generation job"));
                return HttpResponse::InternalServerError().json(serde_json::json!({
                    "error": "Failed to create generation job"
                }));
@@ -573,73 +795,101 @@ pub async fn generate_agentic_insight_handler(
    // Spawn background task with timeout
    let generator = app_state.insight_generator.clone();
    let job_dao = app_state.insight_job_dao.clone();
-    let lib_id = library.id;
+    let job_handles = app_state.insight_job_handles.clone();
    let path = normalized_path.clone();

-    tokio::spawn(async move {
+    let handle = tokio::spawn(async move {
        let timeout_secs: u64 = std::env::var("INSIGHT_GENERATION_TIMEOUT_SECS")
            .ok()
            .and_then(|v| v.parse().ok())
            .unwrap_or(180);

-        let result = tokio::time::timeout(
-            std::time::Duration::from_secs(timeout_secs),
-            generator.generate_agentic_insight_for_photo(
-                &path,
-                request.model.clone(),
-                request.system_prompt.clone(),
-                request.num_ctx,
-                request.temperature,
-                request.top_p,
-                request.top_k,
-                request.min_p,
-                max_iterations,
-                request.backend.clone(),
-                fewshot_examples,
-                fewshot_ids,
-                user_id,
-                persona_id,
-            ),
-        )
+        let path_for_task = path.clone();
+        let generator_for_task = generator.clone();
+        let result = tokio::task::spawn(async move {
+            tokio::time::timeout(
+                std::time::Duration::from_secs(timeout_secs),
+                generator_for_task.generate_agentic_insight_for_photo(
+                    &path_for_task,
+                    request.model.clone(),
+                    request.system_prompt.clone(),
+                    request.num_ctx,
+                    request.temperature,
+                    request.top_p,
+                    request.top_k,
+                    request.min_p,
+                    max_iterations,
+                    request.backend.clone(),
+                    fewshot_examples,
+                    fewshot_ids,
+                    user_id,
+                    persona_id,
+                ),
+            )
+            .await
+        })
        .await;

        let ctx = opentelemetry::Context::new();
        let mut dao = job_dao.lock().expect("Unable to lock InsightJobDao");

        match result {
-            Ok(Ok(_)) => {
-                // Fetch the stored insight id to record on the job
-                let mut insight_dao = generator
-                    .insight_dao()
-                    .lock()
-                    .expect("Unable to lock InsightDao");
-                let insight_id = insight_dao
-                    .get_insight(&ctx, &path)
-                    .ok()
-                    .flatten()
-                    .map(|i| i.id);
-                if let Some(id) = insight_id {
-                    let _ = dao.complete_job(&ctx, job_id, id);
-                } else {
-                    let _ = dao.fail_job(&ctx, job_id, "generation returned no insight");
+            Ok(Ok(Ok((Some(insight_id), _)))) => {
+                if let Err(e) = dao.complete_job(&ctx, job_id, insight_id) {
+                    log::error!("Failed to mark job {} as completed: {:?}", job_id, e);
                }
            }
-            Ok(Err(e)) => {
-                log::error!("Agentic insight generation failed for {}: {:?}", path, e);
-                let _ = dao.fail_job(&ctx, job_id, &format!("{:?}", e));
+            Ok(Ok(Ok((None, _)))) => {
+                if let Err(e) = dao.fail_job(&ctx, job_id, "agentic generation returned no insight")
+                {
+                    log::error!("Failed to mark job {} as failed: {:?}", job_id, e);
+                }
            }
-            Err(_) => {
+            Ok(Ok(Err(e))) => {
+                log::error!("Agentic insight generation failed for {}: {:?}", path, e);
+                if let Err(err) = dao.fail_job(&ctx, job_id, &format!("{:?}", e)) {
+                    log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
+                }
+            }
+            Ok(Err(_)) => {
                log::error!(
                    "Agentic insight generation timed out for {} after {}s",
                    path,
                    timeout_secs
                );
-                let _ = dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs));
+                if let Err(err) =
+                    dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs))
+                {
+                    log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
+                }
+            }
+            Err(_) => {
+                log::error!("Agentic insight generation task panicked for {}", path);
+                if let Err(err) = dao.fail_job(&ctx, job_id, "generation task panicked") {
+                    log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
+                }
            }
        }
+
+        // Remove handle from map on completion
+        let mut handles = job_handles
+            .lock()
+            .expect("Unable to lock InsightJobHandles");
+        handles.remove(&job_id);
    });

-    HttpResponse::Ok().json(JobIdResponse { job_id })
+    // Store abort handle
+    {
+        let mut handles = app_state
+            .insight_job_handles
+            .lock()
+            .expect("Unable to lock InsightJobHandles");
+        handles.insert(job_id, handle.abort_handle());
+    }
+
+    span.set_attribute(KeyValue::new("job_id", job_id as i64));
+    span.set_status(Status::Ok);
+    HttpResponse::Accepted().json(JobIdResponse { job_id })
 }

 /// GET /insights/models - Local-backend models with capabilities. Returns