From db9dc63e5e40cd94f9d5669deb87d38693304552 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Thu, 30 Apr 2026 18:10:59 +0000 Subject: [PATCH] sqlite: enable WAL + busy_timeout in connect(); 408/413/429 transient MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DB connection helper now sets `journal_mode=WAL`, `busy_timeout=5000`, and `synchronous=NORMAL` on every connection. 13+ DAOs each open their own connection through this helper and share one SQLite file — without WAL, a writer's exclusive lock blocks readers and `load_persons` racing the face-watch write storm errored instantly with "database is locked". GPU face inference made this visible by speeding detect ~10× and flooding the writer side. WAL mode persists in the database file once set, so the debug binaries that bypass connect() inherit it automatically; `busy_timeout` and `synchronous` are per-connection settings and are not inherited by those binaries. Also widen face_client.rs's classifier: 408 / 413 / 429 are now Transient instead of Permanent. These are operator-fixable proxy/infra errors; marking them Permanent poisons every affected photo with status='failed' and requires manual SQL to recover. Specifically, Apollo's nginx defaulted to a 1 MB body cap and silently rejected normal-size photos before they reached the backend — the deferred-and-retry contract is the right behavior for that class of fault. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/face_client.rs | 27 +++++++++++++++++++++++++++ src/database/mod.rs | 19 ++++++++++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/ai/face_client.rs b/src/ai/face_client.rs index 8a52812..e76d9ee 100644 --- a/src/ai/face_client.rs +++ b/src/ai/face_client.rs @@ -298,6 +298,20 @@ fn classify_error_response(status: u16, body_text: &str) -> FaceDetectError { body_text )); } + // Infra-level 4xx that an operator can fix without re-encoding the + // bytes: 408 (proxy timeout), 413 (request too large — reverse-proxy + // body cap), 429 (rate limit). 
Treating these as Permanent poisons + // every photo that hit the misconfig with `status='failed'` and + // requires a manual DELETE to recover. Defer instead so the next + // scan tick retries naturally once the proxy is fixed. + if matches!(status, 408 | 413 | 429) { + return FaceDetectError::Transient(anyhow::anyhow!( + "face detect {} {}: {}", + status, + detail_code, + body_text + )); + } // Any other 4xx: be conservative and treat as Permanent so we don't // loop forever on a stable rejection. Any other 5xx: Transient — // likely intermittent. @@ -360,6 +374,19 @@ mod tests { assert!(is_permanent(&classify_error_response(404, "{}"))); } + #[test] + fn classify_infra_4xx_is_transient() { + // 408 / 413 / 429 are operator-fixable proxy/infra errors. + // Marking them Permanent poisons every affected photo with + // status='failed' and requires manual SQL to recover. The + // 413 path specifically bit us when nginx defaulted to a 1 MB + // body cap and rejected normal-size photos before they reached + // the backend. + assert!(is_transient(&classify_error_response(408, ""))); + assert!(is_transient(&classify_error_response(413, "nginx"))); + assert!(is_transient(&classify_error_response(429, "{}"))); + } + #[test] fn classify_handles_unparseable_body() { // Apollo can return non-JSON on misroute / proxy errors; the diff --git a/src/database/mod.rs b/src/database/mod.rs index a4e348a..0ab74a1 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -125,7 +125,24 @@ impl UserDao for SqliteUserDao { pub fn connect() -> SqliteConnection { let db_url = dotenv::var("DATABASE_URL").expect("DATABASE_URL must be set"); - SqliteConnection::establish(&db_url).expect("Error connecting to DB") + let mut conn = SqliteConnection::establish(&db_url).expect("Error connecting to DB"); + // Each DAO opens its own connection (13+ across the app) and they all + // share one DB file. 
Without WAL, a writer holds an exclusive lock + // that blocks readers — `load_persons` racing the face-watch write + // storm errors instantly with `database is locked`. WAL lets readers + // and one writer coexist; busy_timeout makes any remaining + // writer-vs-writer contention wait instead of failing fast. + // synchronous=NORMAL is the standard WAL pairing (FULL is for + // rollback-journal durability; we accept the narrow last-fsync + // window for the 2–10× write throughput). + use diesel::connection::SimpleConnection; + conn.batch_execute( + "PRAGMA journal_mode = WAL; \ + PRAGMA busy_timeout = 5000; \ + PRAGMA synchronous = NORMAL;", + ) + .expect("set sqlite pragmas"); + conn } #[derive(Debug)]