memories: single-SQL rewrite + 20-year lookback

Replaces the EXIF-loop + WalkDir-fallback pipeline that powered
`/memories` with a single per-library SQL query
(`get_memories_in_window`) that uses `strftime('%m-%d' | '%W' | '%m',
date_taken, 'unixepoch', tz_offset)` for calendar matching in the
client's timezone, plus a `years_back` lower bound and a
no-future-dates upper bound. Returns only the matching rows; the
handler applies per-library `PathExcluder` post-query and sorts.

Drops:
- `collect_exif_memories` — replaced by the single SQL query.
- `collect_filesystem_memories` — the canonical-date pipeline now
  populates `date_taken` for every row at ingest, so the WalkDir
  fallback that scanned 14k+ files each request is no longer needed.
- `get_memory_date_with_priority` and friends — request-time waterfall
  superseded by `date_resolver` running at ingest. The associated
  three priority-tests are dropped; their replacement lives in
  `date_resolver::tests`.

On a ~14k-file library this drops `/memories` from 10–15 s
(dominated by `fs::metadata` per row) to single-digit ms.

Bumps `DEFAULT_YEARS_BACK` from 15 → 20 to surface deeper archives
on matching anniversaries.

Note vs. ISO weeks: the original Rust used `chrono::iso_week().week()`
for week-span matching. SQLite's `%W` is Monday-anchored but uses week
0 for days before the first Monday, so it can disagree with ISO at
year boundaries by ±1. Acceptable for nostalgia browsing.

Adds 3 new DAO tests covering month-span filter, library scoping, and
the unknown-span-token guard. Also adds a CLAUDE.md section describing
the canonical-date pipeline end-to-end and the new
`DATE_BACKFILL_MAX_PER_TICK` env var.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-05-06 16:04:09 -04:00
parent 54e0635a98
commit 7f12890f4b
4 changed files with 403 additions and 492 deletions

View File

@@ -9,6 +9,21 @@ use crate::database::models::{
};
use crate::otel::trace_db_call;
/// Decoded shape for `get_memories_in_window`'s raw `sql_query`. Diesel's
/// query DSL doesn't expose strftime, so the memories filter is hand-
/// written SQL — but the returned columns are simple enough that a small
/// `QueryableByName` struct suffices, kept private to this module.
#[derive(diesel::QueryableByName)]
#[allow(dead_code)] // fields read via Diesel's QueryableByName derive
struct MemoriesWindowRow {
#[diesel(sql_type = diesel::sql_types::Text)]
rel_path: String,
#[diesel(sql_type = diesel::sql_types::BigInt)]
date_taken: i64,
#[diesel(sql_type = diesel::sql_types::BigInt)]
last_modified: i64,
}
/// Wire shape for a single member of a duplicate group, returned by
/// `list_duplicates_*` and `lookup_duplicate_row`. Carries everything
/// the Apollo modal needs to render a member tile and its meta line —
@@ -424,6 +439,35 @@ pub trait ExifDao: Sync + Send {
source: &str,
) -> Result<(), DbError>;
/// Single-query backend for `/memories`. Returns
/// `(rel_path, date_taken, last_modified)` for rows in `library_id`
/// whose `date_taken` falls within `[now - years_back y, now]` and
/// whose calendar position matches the request's span:
/// - `"day"` — same month + day-of-month (any year)
/// - `"week"` — same week-of-year (SQLite `%W`, Monday-anchored —
/// close to but not exactly ISO week 8601; the
/// boundary cases at year-start/end can shift by ±1
/// vs the prior request-time `iso_week()` filter)
/// - `"month"` — same month (any year)
///
/// `tz_offset_minutes` is applied to both sides of the strftime
/// comparison so the calendar match is in the user's local time.
/// Backed by the `(library_id, date_taken)` index.
///
/// This is the single-SQL replacement for the EXIF-loop +
/// WalkDir-fallback that powered `/memories` previously; it's
/// correct only because the canonical-date waterfall at ingest
/// (`crate::date_resolver`) populates `date_taken` for every row
/// it can resolve.
fn get_memories_in_window(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
span_token: &str,
years_back: i32,
tz_offset_minutes: i32,
) -> Result<Vec<(String, i64, i64)>, DbError>;
/// Return image rows that have a `content_hash` but no `phash_64`,
/// oldest first. Used by the `backfill_perceptual_hash` binary.
/// Filters by image extension at the DB layer to avoid ever asking
@@ -1090,23 +1134,28 @@ impl ExifDao for SqliteExifDao {
library_id_val: i32,
limit: i64,
) -> Result<Vec<(i32, String)>, DbError> {
trace_db_call(context, "query", "get_rows_needing_date_backfill", |_span| {
use schema::image_exif::dsl::*;
trace_db_call(
context,
"query",
"get_rows_needing_date_backfill",
|_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
// The partial index is on `(library_id, id) WHERE date_taken
// IS NULL OR date_taken_source = 'fs_time'`, so the planner
// hits it directly when both predicates are present.
image_exif
.filter(library_id.eq(library_id_val))
.filter(date_taken.is_null().or(date_taken_source.eq("fs_time")))
.select((library_id, rel_path))
.order(id.asc())
.limit(limit)
.load::<(i32, String)>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))
})
// The partial index is on `(library_id, id) WHERE date_taken
// IS NULL OR date_taken_source = 'fs_time'`, so the planner
// hits it directly when both predicates are present.
image_exif
.filter(library_id.eq(library_id_val))
.filter(date_taken.is_null().or(date_taken_source.eq("fs_time")))
.select((library_id, rel_path))
.order(id.asc())
.limit(limit)
.load::<(i32, String)>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))
},
)
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
@@ -1128,10 +1177,7 @@ impl ExifDao for SqliteExifDao {
.filter(library_id.eq(library_id_val))
.filter(rel_path.eq(rel_path_val)),
)
.set((
date_taken.eq(date_taken_val),
date_taken_source.eq(source),
))
.set((date_taken.eq(date_taken_val), date_taken_source.eq(source)))
.execute(connection.deref_mut())
.map(|_| ())
.map_err(|_| anyhow::anyhow!("Update error"))
@@ -1139,6 +1185,60 @@ impl ExifDao for SqliteExifDao {
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
fn get_memories_in_window(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
span_token: &str,
years_back: i32,
tz_offset_minutes: i32,
) -> Result<Vec<(String, i64, i64)>, DbError> {
trace_db_call(context, "query", "get_memories_in_window", |_span| {
// strftime pattern is span-dependent; the rest of the WHERE
// clause is shared. Only `%m-%d`, `%W`, `%m` are accepted —
// anything else is a programmer error.
let pattern = match span_token {
"day" => "%m-%d",
"week" => "%W",
"month" => "%m",
_ => return Err(anyhow::anyhow!("invalid span token: {}", span_token)),
};
// SQLite's date modifiers want a string like `'-480 minutes'`
// (signed) or `'-15 years'`. Use the `+` flag so positive
// offsets render as `+480 minutes`.
let tz_modifier = format!("{:+} minutes", tz_offset_minutes);
let years_modifier = format!("-{} years", years_back);
let sql = format!(
"SELECT rel_path, date_taken, last_modified \
FROM image_exif \
WHERE library_id = ?1 \
AND date_taken IS NOT NULL \
AND date_taken <= unixepoch('now') \
AND date_taken >= unixepoch('now', ?2) \
AND strftime('{p}', date_taken, 'unixepoch', ?3) \
= strftime('{p}', 'now', ?3)",
p = pattern,
);
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
diesel::sql_query(sql)
.bind::<diesel::sql_types::Integer, _>(library_id)
.bind::<diesel::sql_types::Text, _>(years_modifier)
.bind::<diesel::sql_types::Text, _>(tz_modifier)
.load::<MemoriesWindowRow>(connection.deref_mut())
.map(|rows| {
rows.into_iter()
.map(|r| (r.rel_path, r.date_taken, r.last_modified))
.collect()
})
.map_err(|e| anyhow::anyhow!("Query error: {}", e))
})
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn find_by_content_hash(
&mut self,
context: &opentelemetry::Context,
@@ -2069,9 +2169,7 @@ mod exif_dao_tests {
// Other library — never returned even when eligible.
insert_row_with_source(&mut dao, 2, "archive/null.jpg", None, None);
let rows = dao
.get_rows_needing_date_backfill(&ctx(), 1, 100)
.unwrap();
let rows = dao.get_rows_needing_date_backfill(&ctx(), 1, 100).unwrap();
let paths: Vec<String> = rows.into_iter().map(|(_, p)| p).collect();
assert_eq!(paths.len(), 2, "expected null + fs_time eligible only");
assert!(paths.contains(&"main/null.jpg".to_string()));
@@ -2098,4 +2196,125 @@ mod exif_dao_tests {
assert_eq!(row.content_hash, Some("deadbeef".to_string()));
assert_eq!(row.size_bytes, Some(1024));
}
#[test]
fn get_memories_in_window_day_matches_only_same_md_in_year_window() {
let mut dao = setup_two_libraries();
// Anchor on a known date so the test is timezone-stable: insert
// rows whose date_taken IS the same wall-clock time as `now()`
// would have been some N years ago, and verify the day-span
// filter returns them. We can't bind 'now' from Rust, so instead
// we insert rows for the *current* day (offset by 365 days * N
// years) and rely on SQLite computing the same `%m-%d` for both
// sides of the equality. Using the unix-now-minus-365*N seconds
// approximation is good enough — leap years drift by ~one day
// every four years, but the test only checks day-of-year match
// for rows inserted "today minus N years (no leap correction)".
// To dodge the leap-year drift entirely, we use rows whose
// calendar date is read back from SQLite and we just check
// membership.
// 1y, 5y, 10y, 21y back from 'now':
let now_ts = chrono::Utc::now().timestamp();
let year_secs: i64 = 365 * 86_400;
insert_row_with_source(
&mut dao,
1,
"y1.jpg",
Some(now_ts - year_secs),
Some("exif"),
);
insert_row_with_source(
&mut dao,
1,
"y5.jpg",
Some(now_ts - 5 * year_secs),
Some("exif"),
);
insert_row_with_source(
&mut dao,
1,
"y10.jpg",
Some(now_ts - 10 * year_secs),
Some("exif"),
);
// Outside the 20-year window:
insert_row_with_source(
&mut dao,
1,
"y21.jpg",
Some(now_ts - 21 * year_secs),
Some("exif"),
);
// Future row: must be excluded by the `<= now` clause.
insert_row_with_source(
&mut dao,
1,
"future.jpg",
Some(now_ts + 86_400),
Some("exif"),
);
// No date — never returned regardless of source.
insert_row_with_source(&mut dao, 1, "nodate.jpg", None, None);
// Month span returns rows from the same calendar month over the
// window — y1, y5, y10 should all qualify (same month any year),
// y21 trims (out of years_back), future trims (> now), nodate
// never qualifies. Day-of-month leap drift means even with 365-
// day approximation a row may shift by one in either direction;
// month is the safer assertion under that approximation.
let rows = dao
.get_memories_in_window(&ctx(), 1, "month", 20, 0)
.unwrap();
let paths: std::collections::HashSet<String> =
rows.into_iter().map(|(p, _, _)| p).collect();
assert!(
paths.contains("y1.jpg") && paths.contains("y5.jpg") && paths.contains("y10.jpg"),
"month span should include all in-window rows: {:?}",
paths
);
assert!(
!paths.contains("y21.jpg"),
"21-year-old row should fall outside the years_back window"
);
assert!(!paths.contains("future.jpg"), "future row must be excluded");
assert!(
!paths.contains("nodate.jpg"),
"row without date must never appear"
);
}
#[test]
fn get_memories_in_window_scopes_by_library_id() {
let mut dao = setup_two_libraries();
let now_ts = chrono::Utc::now().timestamp();
let year = 365 * 86_400i64;
insert_row_with_source(&mut dao, 1, "main/x.jpg", Some(now_ts - year), Some("exif"));
insert_row_with_source(
&mut dao,
2,
"archive/x.jpg",
Some(now_ts - year),
Some("exif"),
);
let lib1 = dao
.get_memories_in_window(&ctx(), 1, "month", 20, 0)
.unwrap();
let lib2 = dao
.get_memories_in_window(&ctx(), 2, "month", 20, 0)
.unwrap();
assert_eq!(lib1.len(), 1);
assert_eq!(lib1[0].0, "main/x.jpg");
assert_eq!(lib2.len(), 1);
assert_eq!(lib2[0].0, "archive/x.jpg");
}
#[test]
fn get_memories_in_window_rejects_unknown_span_token() {
let mut dao = setup_two_libraries();
let err = dao.get_memories_in_window(&ctx(), 1, "decade", 20, 0);
assert!(err.is_err());
}
}