feature/insight-jobs #102
@@ -1,8 +1,7 @@
|
||||
-- Track async insight generation jobs so the client can poll for
|
||||
-- completion after the server returns 202 Accepted. The UNIQUE
|
||||
-- constraint on (library_id, file_path, generation_type) ensures
|
||||
-- idempotent inserts: if a running job already exists, the caller
|
||||
-- should return that job_id instead of creating a duplicate.
|
||||
-- completion after the server returns 202 Accepted. Each generation
|
||||
-- creates a new row; the application layer cancels prior running
|
||||
-- jobs before inserting.
|
||||
CREATE TABLE insight_generation_jobs (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
library_id INTEGER NOT NULL DEFAULT 1,
|
||||
@@ -12,8 +11,7 @@ CREATE TABLE insight_generation_jobs (
|
||||
started_at INTEGER NOT NULL,
|
||||
completed_at INTEGER,
|
||||
result_insight_id INTEGER,
|
||||
error_message TEXT,
|
||||
UNIQUE(library_id, file_path, generation_type)
|
||||
error_message TEXT
|
||||
);
|
||||
|
||||
-- For the status endpoint: fast lookup by (library_id, file_path)
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
-- Restore UNIQUE constraint
|
||||
|
||||
CREATE TABLE insight_generation_jobs_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
library_id INTEGER NOT NULL DEFAULT 1,
|
||||
file_path TEXT NOT NULL,
|
||||
generation_type TEXT NOT NULL,
|
||||
status TEXT NOT NULL DEFAULT 'running',
|
||||
started_at INTEGER NOT NULL,
|
||||
completed_at INTEGER,
|
||||
result_insight_id INTEGER,
|
||||
error_message TEXT,
|
||||
UNIQUE(library_id, file_path, generation_type)
|
||||
);
|
||||
|
||||
INSERT INTO insight_generation_jobs_new
|
||||
SELECT id, library_id, file_path, generation_type, status, started_at, completed_at, result_insight_id, error_message
|
||||
FROM insight_generation_jobs;
|
||||
|
||||
DROP TABLE insight_generation_jobs;
|
||||
|
||||
ALTER TABLE insight_generation_jobs_new RENAME TO insight_generation_jobs;
|
||||
|
||||
CREATE INDEX idx_insight_gen_jobs_file
|
||||
ON insight_generation_jobs(library_id, file_path);
|
||||
|
||||
CREATE INDEX idx_insight_gen_jobs_status_cleanup
|
||||
ON insight_generation_jobs(status, started_at);
|
||||
@@ -0,0 +1,30 @@
|
||||
-- Remove UNIQUE(library_id, file_path, generation_type) constraint to allow
|
||||
-- multiple job rows per file. This enables proper cancel/regenerate semantics:
|
||||
-- a new job is always inserted on regenerate, and the old job is cancelled
|
||||
-- independently. The application layer prevents concurrent running jobs.
|
||||
|
||||
CREATE TABLE insight_generation_jobs_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
library_id INTEGER NOT NULL DEFAULT 1,
|
||||
file_path TEXT NOT NULL,
|
||||
generation_type TEXT NOT NULL,
|
||||
status TEXT NOT NULL DEFAULT 'running',
|
||||
started_at INTEGER NOT NULL,
|
||||
completed_at INTEGER,
|
||||
result_insight_id INTEGER,
|
||||
error_message TEXT
|
||||
);
|
||||
|
||||
INSERT INTO insight_generation_jobs_new
|
||||
SELECT id, library_id, file_path, generation_type, status, started_at, completed_at, result_insight_id, error_message
|
||||
FROM insight_generation_jobs;
|
||||
|
||||
DROP TABLE insight_generation_jobs;
|
||||
|
||||
ALTER TABLE insight_generation_jobs_new RENAME TO insight_generation_jobs;
|
||||
|
||||
CREATE INDEX idx_insight_gen_jobs_file
|
||||
ON insight_generation_jobs(library_id, file_path);
|
||||
|
||||
CREATE INDEX idx_insight_gen_jobs_status_cleanup
|
||||
ON insight_generation_jobs(status, started_at);
|
||||
@@ -0,0 +1,11 @@
|
||||
-- SQLite doesn't support DROP COLUMN before 3.35.0; recreate the table
|
||||
-- without the new columns. This is only needed for rollback.
|
||||
CREATE TABLE photo_insights_old AS
|
||||
SELECT id, library_id, rel_path, title, summary, generated_at,
|
||||
model_version, is_current, training_messages, approved,
|
||||
backend, fewshot_source_ids, content_hash
|
||||
FROM photo_insights;
|
||||
|
||||
DROP TABLE photo_insights;
|
||||
|
||||
ALTER TABLE photo_insights_old RENAME TO photo_insights;
|
||||
@@ -0,0 +1,8 @@
|
||||
-- Persist generation parameters on each insight row for auditing.
|
||||
ALTER TABLE photo_insights ADD COLUMN num_ctx INTEGER;
|
||||
ALTER TABLE photo_insights ADD COLUMN temperature REAL;
|
||||
ALTER TABLE photo_insights ADD COLUMN top_p REAL;
|
||||
ALTER TABLE photo_insights ADD COLUMN top_k INTEGER;
|
||||
ALTER TABLE photo_insights ADD COLUMN min_p REAL;
|
||||
ALTER TABLE photo_insights ADD COLUMN system_prompt TEXT;
|
||||
ALTER TABLE photo_insights ADD COLUMN persona_id TEXT;
|
||||
+313
-63
@@ -73,13 +73,13 @@ pub struct GenerationStatusQuery {
|
||||
/// If provided with `library`, look up the latest running job for this
|
||||
/// file. Used when the client doesn't have a persisted job_id.
|
||||
#[serde(default)]
|
||||
pub file_path: Option<String>,
|
||||
pub path: Option<String>,
|
||||
#[serde(default)]
|
||||
pub library: Option<String>,
|
||||
}
|
||||
|
||||
/// GET /insights/generation/status - Check status of a generation job.
|
||||
/// Accepts either `?job_id=<id>` or `?file_path=<path>&library=<name>`.
|
||||
/// Accepts either `?job_id=<id>` or `?path=<path>&library=<name>`.
|
||||
#[get("/insights/generation/status")]
|
||||
pub async fn generation_status_handler(
|
||||
_claims: Claims,
|
||||
@@ -118,7 +118,7 @@ pub async fn generation_status_handler(
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(ref fp) = query.file_path {
|
||||
if let Some(ref fp) = query.path {
|
||||
let library = libraries::resolve_library_param(&app_state, query.library.as_deref())
|
||||
.ok()
|
||||
.flatten()
|
||||
@@ -156,7 +156,115 @@ pub async fn generation_status_handler(
|
||||
}
|
||||
|
||||
HttpResponse::BadRequest().json(serde_json::json!({
|
||||
"error": "Provide either job_id or file_path query parameter"
|
||||
"error": "Provide either job_id or path query parameter"
|
||||
}))
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct CancelGenerationRequest {
|
||||
/// If provided, cancel the specific job by id.
|
||||
#[serde(default)]
|
||||
pub job_id: Option<i32>,
|
||||
/// If provided with `library`, cancel all running jobs for this file.
|
||||
#[serde(default)]
|
||||
pub file_path: Option<String>,
|
||||
#[serde(default)]
|
||||
pub library: Option<String>,
|
||||
}
|
||||
|
||||
/// POST /insights/generation/cancel - Cancel a running generation job.
|
||||
/// Accepts either `job_id` or `file_path` + optional `library` in the body.
|
||||
#[post("/insights/generation/cancel")]
|
||||
pub async fn cancel_generation_handler(
|
||||
_claims: Claims,
|
||||
request: web::Json<CancelGenerationRequest>,
|
||||
app_state: web::Data<AppState>,
|
||||
) -> impl Responder {
|
||||
let ctx = opentelemetry::Context::new();
|
||||
|
||||
if let Some(jid) = request.job_id {
|
||||
let mut dao = app_state
|
||||
.insight_job_dao
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobDao");
|
||||
match dao.cancel_job(&ctx, jid) {
|
||||
Ok(true) => {
|
||||
let mut handles = app_state
|
||||
.insight_job_handles
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobHandles");
|
||||
if let Some(handle) = handles.remove(&jid) {
|
||||
handle.abort();
|
||||
}
|
||||
return HttpResponse::Ok().json(serde_json::json!({
|
||||
"success": true,
|
||||
"message": format!("Job {} cancelled", jid)
|
||||
}));
|
||||
}
|
||||
Ok(false) => {
|
||||
return HttpResponse::Ok().json(serde_json::json!({
|
||||
"success": true,
|
||||
"message": format!("Job {} was not running", jid)
|
||||
}));
|
||||
}
|
||||
Err(e) => {
|
||||
log::error!("Failed to cancel job {}: {:?}", jid, e);
|
||||
return HttpResponse::InternalServerError().json(serde_json::json!({
|
||||
"error": "Failed to cancel job"
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(ref fp) = request.file_path {
|
||||
let library = libraries::resolve_library_param(&app_state, request.library.as_deref())
|
||||
.ok()
|
||||
.flatten()
|
||||
.unwrap_or_else(|| app_state.primary_library());
|
||||
let normalized = normalize_path(fp);
|
||||
|
||||
// Get active job ids first, then cancel in DB, then abort tasks
|
||||
let active_ids: Vec<i32> = {
|
||||
let mut dao = app_state
|
||||
.insight_job_dao
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobDao");
|
||||
let ids = dao
|
||||
.get_active_job(&ctx, library.id, &normalized)
|
||||
.ok()
|
||||
.flatten()
|
||||
.map(|j| vec![j.id])
|
||||
.unwrap_or_default();
|
||||
let _ = dao.cancel_active_jobs(&ctx, library.id, &normalized);
|
||||
ids
|
||||
};
|
||||
|
||||
if active_ids.is_empty() {
|
||||
return HttpResponse::Ok().json(serde_json::json!({
|
||||
"success": true,
|
||||
"message": "No running generation job for this file"
|
||||
}));
|
||||
}
|
||||
|
||||
for jid in &active_ids {
|
||||
if let Some(handle) = app_state
|
||||
.insight_job_handles
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobHandles")
|
||||
.remove(jid)
|
||||
{
|
||||
handle.abort();
|
||||
}
|
||||
}
|
||||
|
||||
return HttpResponse::Ok().json(serde_json::json!({
|
||||
"success": true,
|
||||
"message": format!("Cancelled {} running job(s) for {}", active_ids.len(), normalized)
|
||||
}));
|
||||
}
|
||||
|
||||
HttpResponse::BadRequest().json(serde_json::json!({
|
||||
"error": "Provide either job_id or file_path in the request body"
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -208,6 +316,20 @@ pub struct PhotoInsightResponse {
|
||||
/// True when the insight was generated agentically and a chat
|
||||
/// continuation can be started against it. Drives the mobile chat button.
|
||||
pub has_training_messages: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub num_ctx: Option<i32>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub temperature: Option<f32>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub top_p: Option<f32>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub top_k: Option<i32>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub min_p: Option<f32>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub system_prompt: Option<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub persona_id: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
@@ -227,33 +349,55 @@ pub struct ServerModels {
|
||||
/// POST /insights/generate - Generate insight for a specific photo (async)
|
||||
#[post("/insights/generate")]
|
||||
pub async fn generate_insight_handler(
|
||||
_http_request: HttpRequest,
|
||||
http_request: HttpRequest,
|
||||
_claims: Claims,
|
||||
request: web::Json<GeneratePhotoInsightRequest>,
|
||||
app_state: web::Data<AppState>,
|
||||
) -> impl Responder {
|
||||
let parent_context = extract_context_from_request(&http_request);
|
||||
let tracer = global_tracer();
|
||||
let mut span = tracer.start_with_context("http.insights.generate", &parent_context);
|
||||
|
||||
let normalized_path = normalize_path(&request.file_path);
|
||||
let library = app_state.primary_library();
|
||||
let gen_type = InsightGenerationType::Standard;
|
||||
|
||||
span.set_attribute(KeyValue::new("file_path", normalized_path.clone()));
|
||||
if let Some(ref model) = request.model {
|
||||
span.set_attribute(KeyValue::new("model", model.clone()));
|
||||
}
|
||||
|
||||
log::info!(
|
||||
"Manual insight generation triggered for photo: {} with model: {:?}",
|
||||
normalized_path,
|
||||
request.model
|
||||
);
|
||||
|
||||
// Cancel any running job for this file, then create a fresh one
|
||||
{
|
||||
// Look up and abort any running job for this file, then cancel in DB
|
||||
let old_job_ids: Vec<i32> = {
|
||||
let mut dao = app_state
|
||||
.insight_job_dao
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobDao");
|
||||
let _ = dao.cancel_active_job(
|
||||
&opentelemetry::Context::new(),
|
||||
library.id,
|
||||
&normalized_path,
|
||||
gen_type,
|
||||
);
|
||||
let ctx = opentelemetry::Context::new();
|
||||
let ids = dao
|
||||
.get_active_job(&ctx, library.id, &normalized_path)
|
||||
.ok()
|
||||
.flatten()
|
||||
.map(|j| vec![j.id])
|
||||
.unwrap_or_default();
|
||||
let _ = dao.cancel_active_jobs(&ctx, library.id, &normalized_path);
|
||||
ids
|
||||
};
|
||||
for jid in &old_job_ids {
|
||||
if let Some(handle) = app_state
|
||||
.insight_job_handles
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobHandles")
|
||||
.remove(jid)
|
||||
{
|
||||
handle.abort();
|
||||
}
|
||||
}
|
||||
|
||||
let job_id = {
|
||||
@@ -261,7 +405,7 @@ pub async fn generate_insight_handler(
|
||||
.insight_job_dao
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobDao");
|
||||
match dao.create_or_get_active_job(
|
||||
match dao.create_job(
|
||||
&opentelemetry::Context::new(),
|
||||
library.id,
|
||||
&normalized_path,
|
||||
@@ -270,6 +414,7 @@ pub async fn generate_insight_handler(
|
||||
Ok(id) => id,
|
||||
Err(e) => {
|
||||
log::error!("Failed to create generation job: {:?}", e);
|
||||
span.set_status(Status::error("Failed to create generation job"));
|
||||
return HttpResponse::InternalServerError().json(serde_json::json!({
|
||||
"error": "Failed to create generation job"
|
||||
}));
|
||||
@@ -280,19 +425,22 @@ pub async fn generate_insight_handler(
|
||||
// Spawn background task with timeout
|
||||
let generator = app_state.insight_generator.clone();
|
||||
let job_dao = app_state.insight_job_dao.clone();
|
||||
let lib_id = library.id;
|
||||
let job_handles = app_state.insight_job_handles.clone();
|
||||
let path = normalized_path.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let handle = tokio::spawn(async move {
|
||||
let timeout_secs: u64 = std::env::var("INSIGHT_GENERATION_TIMEOUT_SECS")
|
||||
.ok()
|
||||
.and_then(|v| v.parse().ok())
|
||||
.unwrap_or(120);
|
||||
|
||||
let result = tokio::time::timeout(
|
||||
let path_for_task = path.clone();
|
||||
let generator_for_task = generator.clone();
|
||||
let result = tokio::task::spawn(async move {
|
||||
tokio::time::timeout(
|
||||
std::time::Duration::from_secs(timeout_secs),
|
||||
generator.generate_insight_for_photo_with_config(
|
||||
&path,
|
||||
generator_for_task.generate_insight_for_photo_with_config(
|
||||
&path_for_task,
|
||||
request.model.clone(),
|
||||
request.system_prompt.clone(),
|
||||
request.num_ctx,
|
||||
@@ -302,14 +450,15 @@ pub async fn generate_insight_handler(
|
||||
request.min_p,
|
||||
),
|
||||
)
|
||||
.await
|
||||
})
|
||||
.await;
|
||||
|
||||
let ctx = opentelemetry::Context::new();
|
||||
let mut dao = job_dao.lock().expect("Unable to lock InsightJobDao");
|
||||
|
||||
match result {
|
||||
Ok(Ok(())) => {
|
||||
// Look up the stored insight id to record on the job
|
||||
Ok(Ok(Ok(()))) => {
|
||||
let mut insight_dao = generator
|
||||
.insight_dao()
|
||||
.lock()
|
||||
@@ -320,27 +469,60 @@ pub async fn generate_insight_handler(
|
||||
.flatten()
|
||||
.map(|i| i.id);
|
||||
if let Some(id) = insight_id {
|
||||
let _ = dao.complete_job(&ctx, job_id, id);
|
||||
if let Err(e) = dao.complete_job(&ctx, job_id, id) {
|
||||
log::error!("Failed to mark job {} as completed: {:?}", job_id, e);
|
||||
}
|
||||
} else {
|
||||
let _ = dao.fail_job(&ctx, job_id, "generation returned no insight");
|
||||
if let Err(e) = dao.fail_job(&ctx, job_id, "generation returned no insight") {
|
||||
log::error!("Failed to mark job {} as failed: {:?}", job_id, e);
|
||||
}
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
}
|
||||
Ok(Ok(Err(e))) => {
|
||||
log::error!("Insight generation failed for {}: {:?}", path, e);
|
||||
let _ = dao.fail_job(&ctx, job_id, &format!("{:?}", e));
|
||||
if let Err(err) = dao.fail_job(&ctx, job_id, &format!("{:?}", e)) {
|
||||
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
|
||||
}
|
||||
Err(_) => {
|
||||
}
|
||||
Ok(Err(_)) => {
|
||||
log::error!(
|
||||
"Insight generation timed out for {} after {}s",
|
||||
path,
|
||||
timeout_secs
|
||||
);
|
||||
let _ = dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs));
|
||||
if let Err(err) =
|
||||
dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs))
|
||||
{
|
||||
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
log::error!("Insight generation task panicked for {}", path);
|
||||
if let Err(err) = dao.fail_job(&ctx, job_id, "generation task panicked") {
|
||||
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove handle from map on completion
|
||||
let mut handles = job_handles
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobHandles");
|
||||
handles.remove(&job_id);
|
||||
});
|
||||
|
||||
HttpResponse::Ok().json(JobIdResponse { job_id })
|
||||
// Store abort handle
|
||||
{
|
||||
let mut handles = app_state
|
||||
.insight_job_handles
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobHandles");
|
||||
handles.insert(job_id, handle.abort_handle());
|
||||
}
|
||||
|
||||
span.set_attribute(KeyValue::new("job_id", job_id as i64));
|
||||
span.set_status(Status::Ok);
|
||||
HttpResponse::Accepted().json(JobIdResponse { job_id })
|
||||
}
|
||||
|
||||
/// GET /insights?path=/path/to/photo.jpg - Fetch insight for specific photo
|
||||
@@ -385,6 +567,13 @@ pub async fn get_insight_handler(
|
||||
approved: insight.approved,
|
||||
has_training_messages: insight.training_messages.is_some(),
|
||||
backend: insight.backend,
|
||||
num_ctx: insight.num_ctx,
|
||||
temperature: insight.temperature,
|
||||
top_p: insight.top_p,
|
||||
top_k: insight.top_k,
|
||||
min_p: insight.min_p,
|
||||
system_prompt: insight.system_prompt,
|
||||
persona_id: insight.persona_id,
|
||||
};
|
||||
HttpResponse::Ok().json(response)
|
||||
}
|
||||
@@ -454,6 +643,13 @@ pub async fn get_all_insights_handler(
|
||||
approved: insight.approved,
|
||||
has_training_messages: insight.training_messages.is_some(),
|
||||
backend: insight.backend,
|
||||
num_ctx: insight.num_ctx,
|
||||
temperature: insight.temperature,
|
||||
top_p: insight.top_p,
|
||||
top_k: insight.top_k,
|
||||
min_p: insight.min_p,
|
||||
system_prompt: insight.system_prompt,
|
||||
persona_id: insight.persona_id,
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -471,33 +667,58 @@ pub async fn get_all_insights_handler(
|
||||
/// POST /insights/generate/agentic - Generate insight using agentic tool-calling loop (async)
|
||||
#[post("/insights/generate/agentic")]
|
||||
pub async fn generate_agentic_insight_handler(
|
||||
_http_request: HttpRequest,
|
||||
http_request: HttpRequest,
|
||||
claims: Claims,
|
||||
request: web::Json<GeneratePhotoInsightRequest>,
|
||||
app_state: web::Data<AppState>,
|
||||
) -> impl Responder {
|
||||
let parent_context = extract_context_from_request(&http_request);
|
||||
let tracer = global_tracer();
|
||||
let mut span = tracer.start_with_context("http.insights.generate_agentic", &parent_context);
|
||||
|
||||
let normalized_path = normalize_path(&request.file_path);
|
||||
let library = app_state.primary_library();
|
||||
let gen_type = InsightGenerationType::Agentic;
|
||||
|
||||
span.set_attribute(KeyValue::new("file_path", normalized_path.clone()));
|
||||
if let Some(ref model) = request.model {
|
||||
span.set_attribute(KeyValue::new("model", model.clone()));
|
||||
}
|
||||
if let Some(ref backend) = request.backend {
|
||||
span.set_attribute(KeyValue::new("backend", backend.clone()));
|
||||
}
|
||||
|
||||
log::info!(
|
||||
"Agentic insight generation triggered for photo: {} with model: {:?}",
|
||||
normalized_path,
|
||||
request.model
|
||||
);
|
||||
|
||||
// Cancel any running job for this file, then create a fresh one
|
||||
{
|
||||
// Look up and abort any running job for this file, then cancel in DB
|
||||
let old_job_ids: Vec<i32> = {
|
||||
let mut dao = app_state
|
||||
.insight_job_dao
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobDao");
|
||||
let _ = dao.cancel_active_job(
|
||||
&opentelemetry::Context::new(),
|
||||
library.id,
|
||||
&normalized_path,
|
||||
gen_type,
|
||||
);
|
||||
let ctx = opentelemetry::Context::new();
|
||||
let ids = dao
|
||||
.get_active_job(&ctx, library.id, &normalized_path)
|
||||
.ok()
|
||||
.flatten()
|
||||
.map(|j| vec![j.id])
|
||||
.unwrap_or_default();
|
||||
let _ = dao.cancel_active_jobs(&ctx, library.id, &normalized_path);
|
||||
ids
|
||||
};
|
||||
for jid in &old_job_ids {
|
||||
if let Some(handle) = app_state
|
||||
.insight_job_handles
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobHandles")
|
||||
.remove(jid)
|
||||
{
|
||||
handle.abort();
|
||||
}
|
||||
}
|
||||
|
||||
let job_id = {
|
||||
@@ -505,7 +726,7 @@ pub async fn generate_agentic_insight_handler(
|
||||
.insight_job_dao
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobDao");
|
||||
match dao.create_or_get_active_job(
|
||||
match dao.create_job(
|
||||
&opentelemetry::Context::new(),
|
||||
library.id,
|
||||
&normalized_path,
|
||||
@@ -514,6 +735,7 @@ pub async fn generate_agentic_insight_handler(
|
||||
Ok(id) => id,
|
||||
Err(e) => {
|
||||
log::error!("Failed to create agentic generation job: {:?}", e);
|
||||
span.set_status(Status::error("Failed to create generation job"));
|
||||
return HttpResponse::InternalServerError().json(serde_json::json!({
|
||||
"error": "Failed to create generation job"
|
||||
}));
|
||||
@@ -573,19 +795,22 @@ pub async fn generate_agentic_insight_handler(
|
||||
// Spawn background task with timeout
|
||||
let generator = app_state.insight_generator.clone();
|
||||
let job_dao = app_state.insight_job_dao.clone();
|
||||
let lib_id = library.id;
|
||||
let job_handles = app_state.insight_job_handles.clone();
|
||||
let path = normalized_path.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let handle = tokio::spawn(async move {
|
||||
let timeout_secs: u64 = std::env::var("INSIGHT_GENERATION_TIMEOUT_SECS")
|
||||
.ok()
|
||||
.and_then(|v| v.parse().ok())
|
||||
.unwrap_or(180);
|
||||
|
||||
let result = tokio::time::timeout(
|
||||
let path_for_task = path.clone();
|
||||
let generator_for_task = generator.clone();
|
||||
let result = tokio::task::spawn(async move {
|
||||
tokio::time::timeout(
|
||||
std::time::Duration::from_secs(timeout_secs),
|
||||
generator.generate_agentic_insight_for_photo(
|
||||
&path,
|
||||
generator_for_task.generate_agentic_insight_for_photo(
|
||||
&path_for_task,
|
||||
request.model.clone(),
|
||||
request.system_prompt.clone(),
|
||||
request.num_ctx,
|
||||
@@ -601,45 +826,70 @@ pub async fn generate_agentic_insight_handler(
|
||||
persona_id,
|
||||
),
|
||||
)
|
||||
.await
|
||||
})
|
||||
.await;
|
||||
|
||||
let ctx = opentelemetry::Context::new();
|
||||
let mut dao = job_dao.lock().expect("Unable to lock InsightJobDao");
|
||||
|
||||
match result {
|
||||
Ok(Ok(_)) => {
|
||||
// Fetch the stored insight id to record on the job
|
||||
let mut insight_dao = generator
|
||||
.insight_dao()
|
||||
.lock()
|
||||
.expect("Unable to lock InsightDao");
|
||||
let insight_id = insight_dao
|
||||
.get_insight(&ctx, &path)
|
||||
.ok()
|
||||
.flatten()
|
||||
.map(|i| i.id);
|
||||
if let Some(id) = insight_id {
|
||||
let _ = dao.complete_job(&ctx, job_id, id);
|
||||
} else {
|
||||
let _ = dao.fail_job(&ctx, job_id, "generation returned no insight");
|
||||
Ok(Ok(Ok((Some(insight_id), _)))) => {
|
||||
if let Err(e) = dao.complete_job(&ctx, job_id, insight_id) {
|
||||
log::error!("Failed to mark job {} as completed: {:?}", job_id, e);
|
||||
}
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
Ok(Ok(Ok((None, _)))) => {
|
||||
if let Err(e) = dao.fail_job(&ctx, job_id, "agentic generation returned no insight")
|
||||
{
|
||||
log::error!("Failed to mark job {} as failed: {:?}", job_id, e);
|
||||
}
|
||||
}
|
||||
Ok(Ok(Err(e))) => {
|
||||
log::error!("Agentic insight generation failed for {}: {:?}", path, e);
|
||||
let _ = dao.fail_job(&ctx, job_id, &format!("{:?}", e));
|
||||
if let Err(err) = dao.fail_job(&ctx, job_id, &format!("{:?}", e)) {
|
||||
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
|
||||
}
|
||||
Err(_) => {
|
||||
}
|
||||
Ok(Err(_)) => {
|
||||
log::error!(
|
||||
"Agentic insight generation timed out for {} after {}s",
|
||||
path,
|
||||
timeout_secs
|
||||
);
|
||||
let _ = dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs));
|
||||
if let Err(err) =
|
||||
dao.fail_job(&ctx, job_id, &format!("timeout after {}s", timeout_secs))
|
||||
{
|
||||
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
log::error!("Agentic insight generation task panicked for {}", path);
|
||||
if let Err(err) = dao.fail_job(&ctx, job_id, "generation task panicked") {
|
||||
log::error!("Failed to mark job {} as failed: {:?}", job_id, err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove handle from map on completion
|
||||
let mut handles = job_handles
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobHandles");
|
||||
handles.remove(&job_id);
|
||||
});
|
||||
|
||||
HttpResponse::Ok().json(JobIdResponse { job_id })
|
||||
// Store abort handle
|
||||
{
|
||||
let mut handles = app_state
|
||||
.insight_job_handles
|
||||
.lock()
|
||||
.expect("Unable to lock InsightJobHandles");
|
||||
handles.insert(job_id, handle.abort_handle());
|
||||
}
|
||||
|
||||
span.set_attribute(KeyValue::new("job_id", job_id as i64));
|
||||
span.set_status(Status::Ok);
|
||||
HttpResponse::Accepted().json(JobIdResponse { job_id })
|
||||
}
|
||||
|
||||
/// GET /insights/models - Local-backend models with capabilities. Returns
|
||||
|
||||
@@ -517,6 +517,13 @@ impl InsightChatService {
|
||||
backend: kind.as_str().to_string(),
|
||||
fewshot_source_ids: None,
|
||||
content_hash: None,
|
||||
num_ctx: req.num_ctx,
|
||||
temperature: req.temperature,
|
||||
top_p: req.top_p,
|
||||
top_k: req.top_k,
|
||||
min_p: req.min_p,
|
||||
system_prompt: req.system_prompt.clone(),
|
||||
persona_id: req.persona_id.clone(),
|
||||
};
|
||||
let cx = opentelemetry::Context::new();
|
||||
let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao");
|
||||
@@ -864,6 +871,13 @@ impl InsightChatService {
|
||||
backend: kind.as_str().to_string(),
|
||||
fewshot_source_ids: None,
|
||||
content_hash: None,
|
||||
num_ctx: req.num_ctx,
|
||||
temperature: req.temperature,
|
||||
top_p: req.top_p,
|
||||
top_k: req.top_k,
|
||||
min_p: req.min_p,
|
||||
system_prompt: req.system_prompt.clone(),
|
||||
persona_id: req.persona_id.clone(),
|
||||
};
|
||||
let cx = opentelemetry::Context::new();
|
||||
let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao");
|
||||
@@ -1052,6 +1066,13 @@ impl InsightChatService {
|
||||
backend: kind.as_str().to_string(),
|
||||
fewshot_source_ids: None,
|
||||
content_hash: None,
|
||||
num_ctx: req.num_ctx,
|
||||
temperature: req.temperature,
|
||||
top_p: req.top_p,
|
||||
top_k: req.top_k,
|
||||
min_p: req.min_p,
|
||||
system_prompt: req.system_prompt.clone(),
|
||||
persona_id: req.persona_id.clone(),
|
||||
};
|
||||
let stored = {
|
||||
let cx = opentelemetry::Context::new();
|
||||
|
||||
@@ -1432,6 +1432,13 @@ impl InsightGenerator {
|
||||
backend: "local".to_string(),
|
||||
fewshot_source_ids: None,
|
||||
content_hash: None,
|
||||
num_ctx,
|
||||
temperature,
|
||||
top_p,
|
||||
top_k,
|
||||
min_p,
|
||||
system_prompt: custom_system_prompt.clone(),
|
||||
persona_id: None,
|
||||
};
|
||||
|
||||
let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao");
|
||||
@@ -4176,6 +4183,13 @@ Return ONLY the summary, nothing else."#,
|
||||
backend: kind.as_str().to_string(),
|
||||
fewshot_source_ids: fewshot_source_ids_json,
|
||||
content_hash: None,
|
||||
num_ctx,
|
||||
temperature,
|
||||
top_p,
|
||||
top_k,
|
||||
min_p,
|
||||
system_prompt: custom_system_prompt.clone(),
|
||||
persona_id: Some(persona_id.clone()),
|
||||
};
|
||||
|
||||
let stored = {
|
||||
|
||||
+5
-5
@@ -19,11 +19,11 @@ pub use daily_summary_job::{
|
||||
generate_daily_summaries, strip_summary_boilerplate,
|
||||
};
|
||||
pub use handlers::{
|
||||
chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler,
|
||||
delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler,
|
||||
generate_insight_handler, generation_status_handler, get_all_insights_handler,
|
||||
get_available_models_handler, get_insight_handler, get_openrouter_models_handler,
|
||||
rate_insight_handler,
|
||||
cancel_generation_handler, chat_history_handler, chat_rewind_handler, chat_stream_handler,
|
||||
chat_turn_handler, delete_insight_handler, export_training_data_handler,
|
||||
generate_agentic_insight_handler, generate_insight_handler, generation_status_handler,
|
||||
get_all_insights_handler, get_available_models_handler, get_insight_handler,
|
||||
get_openrouter_models_handler, rate_insight_handler,
|
||||
};
|
||||
pub use insight_generator::InsightGenerator;
|
||||
pub use llamacpp::LlamaCppClient;
|
||||
|
||||
@@ -10,13 +10,13 @@ use crate::database::schema;
|
||||
use crate::database::{DbError, DbErrorKind, connect};
|
||||
use crate::otel::trace_db_call;
|
||||
|
||||
/// Tracks async insight generation jobs. The idempotent insert ensures
|
||||
/// concurrent callers for the same (library_id, file_path, generation_type)
|
||||
/// get the same job_id rather than creating duplicates.
|
||||
/// Tracks async insight generation jobs. Each call to `create_job` inserts
|
||||
/// a new row; the application layer prevents concurrent running jobs by
|
||||
/// cancelling the old one before creating a new one.
|
||||
pub trait InsightGenerationJobDao: Sync + Send {
|
||||
/// Insert a new job or return the existing running job for the same key.
|
||||
/// Returns the job_id either way.
|
||||
fn create_or_get_active_job(
|
||||
/// Insert a new running job. Always creates a new row (no upsert).
|
||||
/// Cleans up terminal-state rows for the same key first.
|
||||
fn create_job(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id: i32,
|
||||
@@ -24,7 +24,9 @@ pub trait InsightGenerationJobDao: Sync + Send {
|
||||
generation_type: InsightGenerationType,
|
||||
) -> Result<i32, DbError>;
|
||||
|
||||
/// Mark a job as completed with the resulting insight id.
|
||||
/// Mark a job as completed with the resulting insight id. Only updates
|
||||
/// if the job is still in "running" status (prevents overwriting a
|
||||
/// cancelled job with a late-completing task).
|
||||
fn complete_job(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
@@ -32,7 +34,8 @@ pub trait InsightGenerationJobDao: Sync + Send {
|
||||
insight_id: i32,
|
||||
) -> Result<(), DbError>;
|
||||
|
||||
/// Mark a job as failed with an error message.
|
||||
/// Mark a job as failed with an error message. Only updates if the job
|
||||
/// is still in "running" status.
|
||||
fn fail_job(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
@@ -40,15 +43,22 @@ pub trait InsightGenerationJobDao: Sync + Send {
|
||||
error_message: &str,
|
||||
) -> Result<(), DbError>;
|
||||
|
||||
/// Mark the active running job for a file as "cancelled". Returns true if
|
||||
/// a job was found and cancelled, false if no running job existed.
|
||||
fn cancel_active_job(
|
||||
/// Cancel a specific job by id. Only updates if the job is still
|
||||
/// in "running" status. Returns true if a row was updated.
|
||||
fn cancel_job(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
job_id: i32,
|
||||
) -> Result<bool, DbError>;
|
||||
|
||||
/// Cancel all running jobs for a given file. Returns the number of
|
||||
/// jobs cancelled.
|
||||
fn cancel_active_jobs(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id: i32,
|
||||
file_path: &str,
|
||||
generation_type: InsightGenerationType,
|
||||
) -> Result<bool, DbError>;
|
||||
) -> Result<usize, DbError>;
|
||||
|
||||
/// Find the latest running job for a given file. Returns None if no
|
||||
/// running job exists.
|
||||
@@ -65,6 +75,11 @@ pub trait InsightGenerationJobDao: Sync + Send {
|
||||
context: &opentelemetry::Context,
|
||||
job_id: i32,
|
||||
) -> Result<Option<InsightGenerationJob>, DbError>;
|
||||
|
||||
/// Mark all jobs still in "running" status as "failed" with a recovery
|
||||
/// error message. Returns the number of jobs recovered.
|
||||
fn recover_orphaned_jobs(&mut self, context: &opentelemetry::Context)
|
||||
-> Result<usize, DbError>;
|
||||
}
|
||||
|
||||
pub struct SqliteInsightGenerationJobDao {
|
||||
@@ -91,14 +106,14 @@ impl SqliteInsightGenerationJobDao {
|
||||
}
|
||||
|
||||
impl InsightGenerationJobDao for SqliteInsightGenerationJobDao {
|
||||
fn create_or_get_active_job(
|
||||
fn create_job(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id: i32,
|
||||
file_path: &str,
|
||||
generation_type: InsightGenerationType,
|
||||
) -> Result<i32, DbError> {
|
||||
trace_db_call(context, "insert", "create_or_get_active_job", |_span| {
|
||||
trace_db_call(context, "insert", "create_job", |_span| {
|
||||
use schema::insight_generation_jobs::dsl;
|
||||
|
||||
let mut connection = self
|
||||
@@ -106,26 +121,6 @@ impl InsightGenerationJobDao for SqliteInsightGenerationJobDao {
|
||||
.lock()
|
||||
.expect("Unable to lock InsightGenerationJobDao");
|
||||
|
||||
// Check for existing running job
|
||||
let existing = dsl::insight_generation_jobs
|
||||
.filter(
|
||||
dsl::library_id
|
||||
.eq(library_id)
|
||||
.and(dsl::file_path.eq(file_path))
|
||||
.and(dsl::generation_type.eq(generation_type.as_str()))
|
||||
.and(dsl::status.eq(InsightJobStatus::Running.as_str())),
|
||||
)
|
||||
.select(dsl::id)
|
||||
.first::<i32>(connection.deref_mut())
|
||||
.optional();
|
||||
|
||||
match existing {
|
||||
Ok(Some(job_id)) => return Ok(job_id),
|
||||
Ok(None) => {}
|
||||
Err(e) => return Err(anyhow::anyhow!("Failed to check existing job: {}", e)),
|
||||
}
|
||||
|
||||
// No running job exists, insert new one (upsert on conflict)
|
||||
let now = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.expect("Time went backwards")
|
||||
@@ -141,22 +136,16 @@ impl InsightGenerationJobDao for SqliteInsightGenerationJobDao {
|
||||
|
||||
diesel::insert_into(dsl::insight_generation_jobs)
|
||||
.values(&new_job)
|
||||
.on_conflict((dsl::library_id, dsl::file_path, dsl::generation_type))
|
||||
.do_update()
|
||||
.set((
|
||||
dsl::status.eq(InsightJobStatus::Running.as_str()),
|
||||
dsl::started_at.eq(now),
|
||||
))
|
||||
.execute(connection.deref_mut())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to insert job: {}", e))?;
|
||||
|
||||
// Get the job id
|
||||
dsl::insight_generation_jobs
|
||||
.filter(
|
||||
dsl::library_id
|
||||
.eq(library_id)
|
||||
.and(dsl::file_path.eq(file_path))
|
||||
.and(dsl::generation_type.eq(generation_type.as_str())),
|
||||
.and(dsl::generation_type.eq(generation_type.as_str()))
|
||||
.and(dsl::status.eq(InsightJobStatus::Running.as_str())),
|
||||
)
|
||||
.select(dsl::id)
|
||||
.order(dsl::id.desc())
|
||||
@@ -185,7 +174,15 @@ impl InsightGenerationJobDao for SqliteInsightGenerationJobDao {
|
||||
.expect("Time went backwards")
|
||||
.as_secs() as i64;
|
||||
|
||||
diesel::update(dsl::insight_generation_jobs.filter(dsl::id.eq(job_id)))
|
||||
// Only update if still running — prevents cancelled job from
|
||||
// being overwritten by a late-completing task.
|
||||
diesel::update(
|
||||
dsl::insight_generation_jobs.filter(
|
||||
dsl::id
|
||||
.eq(job_id)
|
||||
.and(dsl::status.eq(InsightJobStatus::Running.as_str())),
|
||||
),
|
||||
)
|
||||
.set((
|
||||
dsl::status.eq(InsightJobStatus::Completed.as_str()),
|
||||
dsl::completed_at.eq(Some(now)),
|
||||
@@ -217,7 +214,14 @@ impl InsightGenerationJobDao for SqliteInsightGenerationJobDao {
|
||||
.expect("Time went backwards")
|
||||
.as_secs() as i64;
|
||||
|
||||
diesel::update(dsl::insight_generation_jobs.filter(dsl::id.eq(job_id)))
|
||||
// Only update if still running.
|
||||
diesel::update(
|
||||
dsl::insight_generation_jobs.filter(
|
||||
dsl::id
|
||||
.eq(job_id)
|
||||
.and(dsl::status.eq(InsightJobStatus::Running.as_str())),
|
||||
),
|
||||
)
|
||||
.set((
|
||||
dsl::status.eq(InsightJobStatus::Failed.as_str()),
|
||||
dsl::completed_at.eq(Some(now)),
|
||||
@@ -230,14 +234,51 @@ impl InsightGenerationJobDao for SqliteInsightGenerationJobDao {
|
||||
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
|
||||
}
|
||||
|
||||
fn cancel_active_job(
|
||||
fn cancel_job(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
job_id: i32,
|
||||
) -> Result<bool, DbError> {
|
||||
trace_db_call(context, "update", "cancel_job", |_span| {
|
||||
use schema::insight_generation_jobs::dsl;
|
||||
|
||||
let mut connection = self
|
||||
.connection
|
||||
.lock()
|
||||
.expect("Unable to lock InsightGenerationJobDao");
|
||||
|
||||
let now = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.expect("Time went backwards")
|
||||
.as_secs() as i64;
|
||||
|
||||
let rows = diesel::update(
|
||||
dsl::insight_generation_jobs.filter(
|
||||
dsl::id
|
||||
.eq(job_id)
|
||||
.and(dsl::status.eq(InsightJobStatus::Running.as_str())),
|
||||
),
|
||||
)
|
||||
.set((
|
||||
dsl::status.eq(InsightJobStatus::Cancelled.as_str()),
|
||||
dsl::completed_at.eq(Some(now)),
|
||||
dsl::error_message.eq(Some("cancelled by user".to_string())),
|
||||
))
|
||||
.execute(connection.deref_mut())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to cancel job: {}", e))?;
|
||||
|
||||
Ok(rows > 0)
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
|
||||
}
|
||||
|
||||
fn cancel_active_jobs(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id: i32,
|
||||
file_path: &str,
|
||||
generation_type: InsightGenerationType,
|
||||
) -> Result<bool, DbError> {
|
||||
trace_db_call(context, "update", "cancel_active_job", |_span| {
|
||||
) -> Result<usize, DbError> {
|
||||
trace_db_call(context, "update", "cancel_active_jobs", |_span| {
|
||||
use schema::insight_generation_jobs::dsl;
|
||||
|
||||
let mut connection = self
|
||||
@@ -255,7 +296,6 @@ impl InsightGenerationJobDao for SqliteInsightGenerationJobDao {
|
||||
dsl::library_id
|
||||
.eq(library_id)
|
||||
.and(dsl::file_path.eq(file_path))
|
||||
.and(dsl::generation_type.eq(generation_type.as_str()))
|
||||
.and(dsl::status.eq(InsightJobStatus::Running.as_str())),
|
||||
),
|
||||
)
|
||||
@@ -265,9 +305,9 @@ impl InsightGenerationJobDao for SqliteInsightGenerationJobDao {
|
||||
dsl::error_message.eq(Some("cancelled by newer request".to_string())),
|
||||
))
|
||||
.execute(connection.deref_mut())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to cancel job: {}", e))?;
|
||||
.map_err(|e| anyhow::anyhow!("Failed to cancel active jobs: {}", e))?;
|
||||
|
||||
Ok(rows > 0)
|
||||
Ok(rows)
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
|
||||
}
|
||||
@@ -322,6 +362,40 @@ impl InsightGenerationJobDao for SqliteInsightGenerationJobDao {
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
|
||||
fn recover_orphaned_jobs(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
) -> Result<usize, DbError> {
|
||||
trace_db_call(context, "update", "recover_orphaned_jobs", |_span| {
|
||||
use schema::insight_generation_jobs::dsl;
|
||||
|
||||
let mut connection = self
|
||||
.connection
|
||||
.lock()
|
||||
.expect("Unable to lock InsightGenerationJobDao");
|
||||
|
||||
let now = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.expect("Time went backwards")
|
||||
.as_secs() as i64;
|
||||
|
||||
let rows = diesel::update(
|
||||
dsl::insight_generation_jobs
|
||||
.filter(dsl::status.eq(InsightJobStatus::Running.as_str())),
|
||||
)
|
||||
.set((
|
||||
dsl::status.eq(InsightJobStatus::Failed.as_str()),
|
||||
dsl::completed_at.eq(Some(now)),
|
||||
dsl::error_message.eq(Some("server crashed while running".to_string())),
|
||||
))
|
||||
.execute(connection.deref_mut())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to recover orphaned jobs: {}", e))?;
|
||||
|
||||
Ok(rows)
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -345,22 +419,19 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn create_job_idempotent() {
|
||||
fn create_job_inserts_new_row() {
|
||||
let mut dao = setup_dao();
|
||||
let ctx = ctx();
|
||||
|
||||
let job_id_1 = dao
|
||||
.create_or_get_active_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
|
||||
let job_id_2 = dao
|
||||
.create_or_get_active_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
job_id_1, job_id_2,
|
||||
"idempotent insert should return same job_id"
|
||||
);
|
||||
assert_ne!(job_id_1, job_id_2, "each create_job call inserts a new row");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -369,7 +440,7 @@ mod tests {
|
||||
let ctx = ctx();
|
||||
|
||||
let job_id = dao
|
||||
.create_or_get_active_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
|
||||
dao.complete_job(&ctx, job_id, 42).unwrap();
|
||||
@@ -386,7 +457,7 @@ mod tests {
|
||||
let ctx = ctx();
|
||||
|
||||
let job_id = dao
|
||||
.create_or_get_active_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Agentic)
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Agentic)
|
||||
.unwrap();
|
||||
|
||||
dao.fail_job(&ctx, job_id, "model timeout").unwrap();
|
||||
@@ -403,7 +474,7 @@ mod tests {
|
||||
let ctx = ctx();
|
||||
|
||||
let job_id = dao
|
||||
.create_or_get_active_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
|
||||
// Job is running
|
||||
@@ -420,18 +491,16 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cancel_active_job() {
|
||||
fn cancel_active_jobs() {
|
||||
let mut dao = setup_dao();
|
||||
let ctx = ctx();
|
||||
|
||||
let job_id = dao
|
||||
.create_or_get_active_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
|
||||
let cancelled = dao
|
||||
.cancel_active_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
assert!(cancelled, "should cancel existing running job");
|
||||
let cancelled = dao.cancel_active_jobs(&ctx, 1, "photos/test.jpg").unwrap();
|
||||
assert_eq!(cancelled, 1, "should cancel 1 running job");
|
||||
|
||||
// Job is no longer active
|
||||
let active = dao.get_active_job(&ctx, 1, "photos/test.jpg").unwrap();
|
||||
@@ -441,11 +510,9 @@ mod tests {
|
||||
let job = dao.get_job_by_id(&ctx, job_id).unwrap().unwrap();
|
||||
assert_eq!(job.status, InsightJobStatus::Cancelled.as_str());
|
||||
|
||||
// Cancelling again returns false (nothing to cancel)
|
||||
let cancelled2 = dao
|
||||
.cancel_active_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
assert!(!cancelled2, "should return false when no running job");
|
||||
// Cancelling again returns 0 (nothing to cancel)
|
||||
let cancelled2 = dao.cancel_active_jobs(&ctx, 1, "photos/test.jpg").unwrap();
|
||||
assert_eq!(cancelled2, 0, "should return 0 when no running job");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -454,11 +521,11 @@ mod tests {
|
||||
let ctx = ctx();
|
||||
|
||||
let job_id_1 = dao
|
||||
.create_or_get_active_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
|
||||
let job_id_2 = dao
|
||||
.create_or_get_active_job(&ctx, 2, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.create_job(&ctx, 2, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
|
||||
assert_ne!(
|
||||
@@ -485,7 +552,7 @@ mod tests {
|
||||
let ctx = ctx();
|
||||
|
||||
let job_id = dao
|
||||
.create_or_get_active_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
|
||||
// Find while running
|
||||
@@ -500,4 +567,115 @@ mod tests {
|
||||
assert_eq!(job.status, InsightJobStatus::Completed.as_str());
|
||||
assert_eq!(job.result_insight_id, Some(99));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn recover_orphaned_jobs() {
|
||||
let mut dao = setup_dao();
|
||||
let ctx = ctx();
|
||||
|
||||
// Create two running jobs
|
||||
let job_id_1 = dao
|
||||
.create_job(&ctx, 1, "photos/a.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
let job_id_2 = dao
|
||||
.create_job(&ctx, 1, "photos/b.jpg", InsightGenerationType::Agentic)
|
||||
.unwrap();
|
||||
|
||||
// Complete one
|
||||
dao.complete_job(&ctx, job_id_1, 1).unwrap();
|
||||
|
||||
// Recover should only affect the running job
|
||||
let recovered = dao.recover_orphaned_jobs(&ctx).unwrap();
|
||||
assert_eq!(recovered, 1, "should recover exactly 1 running job");
|
||||
|
||||
// job_id_1 is still completed
|
||||
let job1 = dao.get_job_by_id(&ctx, job_id_1).unwrap().unwrap();
|
||||
assert_eq!(job1.status, InsightJobStatus::Completed.as_str());
|
||||
|
||||
// job_id_2 is now failed with recovery message
|
||||
let job2 = dao.get_job_by_id(&ctx, job_id_2).unwrap().unwrap();
|
||||
assert_eq!(job2.status, InsightJobStatus::Failed.as_str());
|
||||
assert_eq!(
|
||||
job2.error_message.as_deref(),
|
||||
Some("server crashed while running")
|
||||
);
|
||||
|
||||
// Second recovery is a no-op
|
||||
let recovered2 = dao.recover_orphaned_jobs(&ctx).unwrap();
|
||||
assert_eq!(recovered2, 0, "no running jobs remain");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn complete_job_noop_when_cancelled() {
|
||||
let mut dao = setup_dao();
|
||||
let ctx = ctx();
|
||||
|
||||
let job_id = dao
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
|
||||
dao.cancel_job(&ctx, job_id).unwrap();
|
||||
|
||||
// Late-completing task tries to mark as completed — should be a no-op
|
||||
dao.complete_job(&ctx, job_id, 42).unwrap();
|
||||
|
||||
let job = dao.get_job_by_id(&ctx, job_id).unwrap().unwrap();
|
||||
assert_eq!(
|
||||
job.status,
|
||||
InsightJobStatus::Cancelled.as_str(),
|
||||
"cancelled status must not be overwritten by late complete"
|
||||
);
|
||||
assert_eq!(
|
||||
job.result_insight_id, None,
|
||||
"insight_id must stay None when complete is a no-op"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fail_job_noop_when_cancelled() {
|
||||
let mut dao = setup_dao();
|
||||
let ctx = ctx();
|
||||
|
||||
let job_id = dao
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Agentic)
|
||||
.unwrap();
|
||||
|
||||
dao.cancel_job(&ctx, job_id).unwrap();
|
||||
|
||||
// Late-failing task tries to mark as failed — should be a no-op
|
||||
dao.fail_job(&ctx, job_id, "timeout after 120s").unwrap();
|
||||
|
||||
let job = dao.get_job_by_id(&ctx, job_id).unwrap().unwrap();
|
||||
assert_eq!(
|
||||
job.status,
|
||||
InsightJobStatus::Cancelled.as_str(),
|
||||
"cancelled status must not be overwritten by late fail"
|
||||
);
|
||||
assert_eq!(
|
||||
job.error_message.as_deref(),
|
||||
Some("cancelled by user"),
|
||||
"error_message must reflect the cancel, not the late fail"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cancel_job_by_id() {
|
||||
let mut dao = setup_dao();
|
||||
let ctx = ctx();
|
||||
|
||||
let job_id = dao
|
||||
.create_job(&ctx, 1, "photos/test.jpg", InsightGenerationType::Standard)
|
||||
.unwrap();
|
||||
|
||||
let cancelled = dao.cancel_job(&ctx, job_id).unwrap();
|
||||
assert!(cancelled, "should cancel running job");
|
||||
|
||||
let job = dao.get_job_by_id(&ctx, job_id).unwrap().unwrap();
|
||||
assert_eq!(job.status, InsightJobStatus::Cancelled.as_str());
|
||||
assert!(job.completed_at.is_some());
|
||||
|
||||
// Cancelling again is a no-op
|
||||
let cancelled2 = dao.cancel_job(&ctx, job_id).unwrap();
|
||||
assert!(!cancelled2, "already cancelled job should return false");
|
||||
}
|
||||
}
|
||||
|
||||
+21
-1
@@ -38,7 +38,13 @@ impl InsightJobStatus {
|
||||
"completed" => Self::Completed,
|
||||
"failed" => Self::Failed,
|
||||
"cancelled" => Self::Cancelled,
|
||||
_ => Self::Failed,
|
||||
other => {
|
||||
log::warn!(
|
||||
"Unknown InsightJobStatus value: {:?}, treating as failed",
|
||||
other
|
||||
);
|
||||
Self::Failed
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -224,6 +230,13 @@ pub struct InsertPhotoInsight {
|
||||
/// inserted before the hash is available stay null and the
|
||||
/// reconciliation pass backfills them.
|
||||
pub content_hash: Option<String>,
|
||||
pub num_ctx: Option<i32>,
|
||||
pub temperature: Option<f32>,
|
||||
pub top_p: Option<f32>,
|
||||
pub top_k: Option<i32>,
|
||||
pub min_p: Option<f32>,
|
||||
pub system_prompt: Option<String>,
|
||||
pub persona_id: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Queryable, Clone, Debug)]
|
||||
@@ -243,6 +256,13 @@ pub struct PhotoInsight {
|
||||
pub backend: String,
|
||||
pub fewshot_source_ids: Option<String>,
|
||||
pub content_hash: Option<String>,
|
||||
pub num_ctx: Option<i32>,
|
||||
pub temperature: Option<f32>,
|
||||
pub top_p: Option<f32>,
|
||||
pub top_k: Option<i32>,
|
||||
pub min_p: Option<f32>,
|
||||
pub system_prompt: Option<String>,
|
||||
pub persona_id: Option<String>,
|
||||
}
|
||||
|
||||
// --- Libraries ---
|
||||
|
||||
@@ -216,6 +216,13 @@ diesel::table! {
|
||||
backend -> Text,
|
||||
fewshot_source_ids -> Nullable<Text>,
|
||||
content_hash -> Nullable<Text>,
|
||||
num_ctx -> Nullable<Integer>,
|
||||
temperature -> Nullable<Float>,
|
||||
top_p -> Nullable<Float>,
|
||||
top_k -> Nullable<Integer>,
|
||||
min_p -> Nullable<Float>,
|
||||
system_prompt -> Nullable<Text>,
|
||||
persona_id -> Nullable<Text>,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+17
@@ -75,6 +75,22 @@ fn main() -> std::io::Result<()> {
|
||||
|
||||
run_migrations(&mut connect()).expect("Failed to run migrations");
|
||||
|
||||
// Recover orphaned insight generation jobs from a previous crash.
|
||||
{
|
||||
use crate::database::{InsightGenerationJobDao, SqliteInsightGenerationJobDao};
|
||||
let mut dao = SqliteInsightGenerationJobDao::new();
|
||||
let ctx = opentelemetry::Context::new();
|
||||
match dao.recover_orphaned_jobs(&ctx) {
|
||||
Ok(n) if n > 0 => {
|
||||
info!("Recovered {} orphaned insight generation jobs", n);
|
||||
}
|
||||
Ok(_) => {}
|
||||
Err(e) => {
|
||||
log::warn!("Failed to recover orphaned insight jobs: {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// One-shot retirement of the pre-content-hash HLS layout. Idempotent
|
||||
// — a second boot finds nothing and reports zero deletions, so it's
|
||||
// safe to leave wired in until the module is removed in a later
|
||||
@@ -309,6 +325,7 @@ fn main() -> std::io::Result<()> {
|
||||
.service(ai::generate_insight_handler)
|
||||
.service(ai::generate_agentic_insight_handler)
|
||||
.service(ai::generation_status_handler)
|
||||
.service(ai::cancel_generation_handler)
|
||||
.service(ai::get_insight_handler)
|
||||
.service(ai::delete_insight_handler)
|
||||
.service(ai::get_all_insights_handler)
|
||||
|
||||
@@ -19,6 +19,7 @@ use crate::video::actors::{
|
||||
PlaylistGenerator, PreviewClipGenerator, StreamActor, VideoPlaylistManager,
|
||||
};
|
||||
use actix::{Actor, Addr};
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::sync::{Arc, Mutex, RwLock};
|
||||
|
||||
@@ -88,6 +89,9 @@ pub struct AppState {
|
||||
pub clip_client: ClipClient,
|
||||
/// Tracks async insight generation jobs (spawned by generate endpoints).
|
||||
pub insight_job_dao: Arc<Mutex<Box<dyn InsightGenerationJobDao>>>,
|
||||
/// In-memory map from job_id → tokio AbortHandle for running tasks.
|
||||
/// Used to abort server-side tasks on cancel or regenerate.
|
||||
pub insight_job_handles: Arc<Mutex<HashMap<i32, tokio::task::AbortHandle>>>,
|
||||
}
|
||||
|
||||
impl AppState {
|
||||
@@ -127,6 +131,7 @@ impl AppState {
|
||||
face_client: FaceClient,
|
||||
clip_client: ClipClient,
|
||||
insight_job_dao: Arc<Mutex<Box<dyn InsightGenerationJobDao>>>,
|
||||
insight_job_handles: Arc<Mutex<HashMap<i32, tokio::task::AbortHandle>>>,
|
||||
) -> Self {
|
||||
assert!(
|
||||
!libraries_vec.is_empty(),
|
||||
@@ -169,6 +174,7 @@ impl AppState {
|
||||
face_client,
|
||||
clip_client,
|
||||
insight_job_dao,
|
||||
insight_job_handles,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -260,6 +266,8 @@ impl Default for AppState {
|
||||
// Initialize insight generation job DAO (async generation tracking)
|
||||
let insight_job_dao: Arc<Mutex<Box<dyn InsightGenerationJobDao>>> =
|
||||
Arc::new(Mutex::new(Box::new(SqliteInsightGenerationJobDao::new())));
|
||||
let insight_job_handles: Arc<Mutex<HashMap<i32, tokio::task::AbortHandle>>> =
|
||||
Arc::new(Mutex::new(HashMap::new()));
|
||||
|
||||
// Load base path and ensure the primary library row reflects it.
|
||||
let base_path = env::var("BASE_PATH").expect("BASE_PATH was not set in the env");
|
||||
@@ -328,6 +336,7 @@ impl Default for AppState {
|
||||
face_client,
|
||||
clip_client,
|
||||
insight_job_dao,
|
||||
insight_job_handles,
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -513,6 +522,7 @@ impl AppState {
|
||||
FaceClient::new(None), // disabled in test
|
||||
ClipClient::new(None), // disabled in test
|
||||
Arc::new(Mutex::new(Box::new(SqliteInsightGenerationJobDao::new()))), // placeholder for test
|
||||
Arc::new(Mutex::new(HashMap::new())), // placeholder for test
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user