sqlite: enable WAL + busy_timeout in connect(); 408/413/429 transient
The DB connection helper now sets `journal_mode=WAL`, `busy_timeout=5000`, and `synchronous=NORMAL` on every connection. 13+ DAOs each open their own connection through this helper and share one SQLite file — without WAL, a writer's exclusive lock blocks readers, and `load_persons` racing the face-watch write storm failed instantly with "database is locked". GPU face inference made this visible by speeding up detect ~10× and flooding the writer side. WAL persists in the file once set, so the debug binaries that bypass connect() inherit it automatically (`busy_timeout` and `synchronous` are per-connection settings and are not inherited).

Also widen face_client.rs's classifier: 408 / 413 / 429 are now Transient instead of Permanent. These are operator-fixable proxy/infra errors; marking them Permanent poisons every affected photo with status='failed' and requires manual SQL to recover. Specifically, Apollo's nginx defaulted to a 1 MB body cap and silently rejected normal-size photos before they reached the backend — the deferred-and-retry contract is the right behavior for that class of fault.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -298,6 +298,20 @@ fn classify_error_response(status: u16, body_text: &str) -> FaceDetectError {
|
|||||||
body_text
|
body_text
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
// Infra-level 4xx that an operator can fix without re-encoding the
|
||||||
|
// bytes: 408 (proxy timeout), 413 (request too large — reverse-proxy
|
||||||
|
// body cap), 429 (rate limit). Treating these as Permanent poisons
|
||||||
|
// every photo that hit the misconfig with `status='failed'` and
|
||||||
|
// requires a manual DELETE to recover. Defer instead so the next
|
||||||
|
// scan tick retries naturally once the proxy is fixed.
|
||||||
|
if matches!(status, 408 | 413 | 429) {
|
||||||
|
return FaceDetectError::Transient(anyhow::anyhow!(
|
||||||
|
"face detect {} {}: {}",
|
||||||
|
status,
|
||||||
|
detail_code,
|
||||||
|
body_text
|
||||||
|
));
|
||||||
|
}
|
||||||
// Any other 4xx: be conservative and treat as Permanent so we don't
|
// Any other 4xx: be conservative and treat as Permanent so we don't
|
||||||
// loop forever on a stable rejection. Any other 5xx: Transient —
|
// loop forever on a stable rejection. Any other 5xx: Transient —
|
||||||
// likely intermittent.
|
// likely intermittent.
|
||||||
@@ -360,6 +374,19 @@ mod tests {
|
|||||||
assert!(is_permanent(&classify_error_response(404, "{}")));
|
assert!(is_permanent(&classify_error_response(404, "{}")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn classify_infra_4xx_is_transient() {
|
||||||
|
// 408 / 413 / 429 are operator-fixable proxy/infra errors.
|
||||||
|
// Marking them Permanent poisons every affected photo with
|
||||||
|
// status='failed' and requires manual SQL to recover. The
|
||||||
|
// 413 path specifically bit us when nginx defaulted to a 1 MB
|
||||||
|
// body cap and rejected normal-size photos before they reached
|
||||||
|
// the backend.
|
||||||
|
assert!(is_transient(&classify_error_response(408, "")));
|
||||||
|
assert!(is_transient(&classify_error_response(413, "<html>nginx</html>")));
|
||||||
|
assert!(is_transient(&classify_error_response(429, "{}")));
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn classify_handles_unparseable_body() {
|
fn classify_handles_unparseable_body() {
|
||||||
// Apollo can return non-JSON on misroute / proxy errors; the
|
// Apollo can return non-JSON on misroute / proxy errors; the
|
||||||
|
|||||||
@@ -125,7 +125,24 @@ impl UserDao for SqliteUserDao {
|
|||||||
|
|
||||||
pub fn connect() -> SqliteConnection {
|
pub fn connect() -> SqliteConnection {
|
||||||
let db_url = dotenv::var("DATABASE_URL").expect("DATABASE_URL must be set");
|
let db_url = dotenv::var("DATABASE_URL").expect("DATABASE_URL must be set");
|
||||||
SqliteConnection::establish(&db_url).expect("Error connecting to DB")
|
let mut conn = SqliteConnection::establish(&db_url).expect("Error connecting to DB");
|
||||||
|
// Each DAO opens its own connection (13+ across the app) and they all
|
||||||
|
// share one DB file. Without WAL, a writer holds an exclusive lock
|
||||||
|
// that blocks readers — `load_persons` racing the face-watch write
|
||||||
|
// storm errors instantly with `database is locked`. WAL lets readers
|
||||||
|
// and one writer coexist; busy_timeout makes any remaining
|
||||||
|
// writer-vs-writer contention wait instead of failing fast.
|
||||||
|
// synchronous=NORMAL is the standard WAL pairing (FULL is for
|
||||||
|
// rollback-journal durability; we accept the narrow last-fsync
|
||||||
|
// window for the 2–10× write throughput).
|
||||||
|
use diesel::connection::SimpleConnection;
|
||||||
|
conn.batch_execute(
|
||||||
|
"PRAGMA journal_mode = WAL; \
|
||||||
|
PRAGMA busy_timeout = 5000; \
|
||||||
|
PRAGMA synchronous = NORMAL;",
|
||||||
|
)
|
||||||
|
.expect("set sqlite pragmas");
|
||||||
|
conn
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
|
|||||||
Reference in New Issue
Block a user