sqlite: enable WAL + busy_timeout in connect(); 408/413/429 transient
The DB connection helper now sets `journal_mode=WAL`, `busy_timeout=5000`, and `synchronous=NORMAL` on every connection. 13+ DAOs each open their own connection through this helper and share one SQLite file — without WAL, a writer's exclusive lock blocks readers, and `load_persons` racing the face-watch write storm failed instantly with "database is locked". GPU face inference made this visible by speeding up detection ~10× and flooding the writer side. WAL mode persists in the database file once set, so the debug binaries that bypass connect() inherit it automatically (note that `busy_timeout` and `synchronous` are per-connection settings and are not inherited).

Also widen face_client.rs's classifier: 408 / 413 / 429 are now Transient instead of Permanent. These are operator-fixable proxy/infra errors; marking them Permanent poisons every affected photo with status='failed' and requires manual SQL to recover. Specifically, Apollo's nginx defaulted to a 1 MB body cap and silently rejected normal-size photos before they reached the backend — the deferred-and-retry contract is the right behavior for that class of fault.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -298,6 +298,20 @@ fn classify_error_response(status: u16, body_text: &str) -> FaceDetectError {
|
||||
body_text
|
||||
));
|
||||
}
|
||||
// Infra-level 4xx that an operator can fix without re-encoding the
|
||||
// bytes: 408 (proxy timeout), 413 (request too large — reverse-proxy
|
||||
// body cap), 429 (rate limit). Treating these as Permanent poisons
|
||||
// every photo that hit the misconfig with `status='failed'` and
|
||||
// requires a manual DELETE to recover. Defer instead so the next
|
||||
// scan tick retries naturally once the proxy is fixed.
|
||||
if matches!(status, 408 | 413 | 429) {
|
||||
return FaceDetectError::Transient(anyhow::anyhow!(
|
||||
"face detect {} {}: {}",
|
||||
status,
|
||||
detail_code,
|
||||
body_text
|
||||
));
|
||||
}
|
||||
// Any other 4xx: be conservative and treat as Permanent so we don't
|
||||
// loop forever on a stable rejection. Any other 5xx: Transient —
|
||||
// likely intermittent.
|
||||
@@ -360,6 +374,19 @@ mod tests {
|
||||
assert!(is_permanent(&classify_error_response(404, "{}")));
|
||||
}
|
||||
|
||||
#[test]
fn classify_infra_4xx_is_transient() {
    // Regression guard: 408, 413 and 429 come from proxy/infra
    // misconfiguration that an operator can fix. If they were classified
    // Permanent, every affected photo would be stuck with status='failed'
    // until someone repaired it with manual SQL. The 413 case is the one
    // that actually bit us: nginx defaulted to a 1 MB body cap and
    // rejected normal-size photos before they reached the backend.
    let proxy_timeout = classify_error_response(408, "");
    assert!(is_transient(&proxy_timeout));

    let body_too_large = classify_error_response(413, "<html>nginx</html>");
    assert!(is_transient(&body_too_large));

    let rate_limited = classify_error_response(429, "{}");
    assert!(is_transient(&rate_limited));
}
|
||||
|
||||
#[test]
|
||||
fn classify_handles_unparseable_body() {
|
||||
// Apollo can return non-JSON on misroute / proxy errors; the
|
||||
|
||||
@@ -125,7 +125,24 @@ impl UserDao for SqliteUserDao {
|
||||
|
||||
pub fn connect() -> SqliteConnection {
|
||||
let db_url = dotenv::var("DATABASE_URL").expect("DATABASE_URL must be set");
|
||||
SqliteConnection::establish(&db_url).expect("Error connecting to DB")
|
||||
let mut conn = SqliteConnection::establish(&db_url).expect("Error connecting to DB");
|
||||
// Each DAO opens its own connection (13+ across the app) and they all
|
||||
// share one DB file. Without WAL, a writer holds an exclusive lock
|
||||
// that blocks readers — `load_persons` racing the face-watch write
|
||||
// storm errors instantly with `database is locked`. WAL lets readers
|
||||
// and one writer coexist; busy_timeout makes any remaining
|
||||
// writer-vs-writer contention wait instead of failing fast.
|
||||
// synchronous=NORMAL is the standard WAL pairing (FULL is for
|
||||
// rollback-journal durability; we accept the narrow last-fsync
|
||||
// window for the 2–10× write throughput).
|
||||
use diesel::connection::SimpleConnection;
|
||||
conn.batch_execute(
|
||||
"PRAGMA journal_mode = WAL; \
|
||||
PRAGMA busy_timeout = 5000; \
|
||||
PRAGMA synchronous = NORMAL;",
|
||||
)
|
||||
.expect("set sqlite pragmas");
|
||||
conn
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
|
||||
Reference in New Issue
Block a user