From 84326501a9ff3cbd7cf0424b0ef4ee7998ed4581 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 6 May 2026 15:57:49 -0400 Subject: [PATCH 1/6] image_exif: add date_taken_source column MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New nullable TEXT column tracks which step of the canonical-date waterfall (kamadak-exif → exiftool → filename → fs_time) populated `date_taken`. Lets a later per-tick drain re-resolve weak sources (`fs_time`) once stronger ones become available, and gives the UI/debug surface a way to answer "why does this photo show up under this date?". Adds the column at all `InsertImageExif` construction sites with `None` placeholders (the resolver wiring lands in a follow-up commit), and extends the `update_exif` SET tuple so the column survives the GPS-write re-read path. Partial index `idx_image_exif_date_backfill` is created for the upcoming drain query. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../down.sql | 2 ++ .../up.sql | 24 +++++++++++++++++++ src/database/mod.rs | 2 ++ src/database/models.rs | 8 +++++++ src/database/schema.rs | 1 + src/files.rs | 2 ++ src/main.rs | 6 +++++ 7 files changed, 45 insertions(+) create mode 100644 migrations/2026-05-06-000000_add_date_taken_source/down.sql create mode 100644 migrations/2026-05-06-000000_add_date_taken_source/up.sql diff --git a/migrations/2026-05-06-000000_add_date_taken_source/down.sql b/migrations/2026-05-06-000000_add_date_taken_source/down.sql new file mode 100644 index 0000000..212eddf --- /dev/null +++ b/migrations/2026-05-06-000000_add_date_taken_source/down.sql @@ -0,0 +1,2 @@ +DROP INDEX IF EXISTS idx_image_exif_date_backfill; +ALTER TABLE image_exif DROP COLUMN date_taken_source; diff --git a/migrations/2026-05-06-000000_add_date_taken_source/up.sql b/migrations/2026-05-06-000000_add_date_taken_source/up.sql new file mode 100644 index 0000000..2be2590 --- /dev/null +++ b/migrations/2026-05-06-000000_add_date_taken_source/up.sql 
@@ -0,0 +1,24 @@ +-- Tracks where a row's `date_taken` was sourced so the canonical-date +-- waterfall (kamadak-exif → exiftool → filename → earliest_fs_time) is +-- visible to debugging and to the per-tick backfill drain that re-runs +-- weak sources once stronger ones become available (e.g. exiftool gets +-- installed on a deploy that didn't have it). See CLAUDE.md → Memories +-- canonical-date pipeline. +-- +-- Values: +-- 'exif' — kamadak-exif read DateTime/DateTimeOriginal directly +-- 'exiftool' — exiftool fallback caught a video / MakerNote / QuickTime tag +-- 'filename' — extract_date_from_filename matched a known pattern +-- 'fs_time' — fell through to earliest_fs_time(metadata) +-- +-- NULL when `date_taken` itself is NULL (no source resolved the date). +ALTER TABLE image_exif ADD COLUMN date_taken_source TEXT; + +-- Partial index for the per-tick backfill drain: targets rows that need +-- re-resolution (no date yet, or only the weakest source resolved it). +-- Filename-sourced rows are intentionally excluded — the regex is +-- authoritative when it matches and re-running exiftool wouldn't change +-- the answer. 
+CREATE INDEX idx_image_exif_date_backfill + ON image_exif (library_id, id) + WHERE date_taken IS NULL OR date_taken_source = 'fs_time'; diff --git a/src/database/mod.rs b/src/database/mod.rs index 509315d..9ee2a16 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -730,6 +730,7 @@ impl ExifDao for SqliteExifDao { shutter_speed.eq(&exif_data.shutter_speed), iso.eq(&exif_data.iso), date_taken.eq(&exif_data.date_taken), + date_taken_source.eq(&exif_data.date_taken_source), last_modified.eq(&exif_data.last_modified), )) .execute(connection.deref_mut()) @@ -1819,6 +1820,7 @@ mod exif_dao_tests { size_bytes: None, phash_64: None, dhash_64: None, + date_taken_source: None, }, ) .expect("insert exif row"); diff --git a/src/database/models.rs b/src/database/models.rs index 9d1a3b8..1e3139d 100644 --- a/src/database/models.rs +++ b/src/database/models.rs @@ -63,6 +63,12 @@ pub struct InsertImageExif { pub phash_64: Option, /// 64-bit dHash (gradient). NULL for videos and decode failures. pub dhash_64: Option, + /// Which step of the canonical-date waterfall populated `date_taken`: + /// `"exif"` | `"exiftool"` | `"filename"` | `"fs_time"`. NULL when + /// `date_taken` is NULL (no source resolved it). The per-tick backfill + /// drain re-resolves rows whose source is `"fs_time"` once exiftool + /// has had a chance to run. + pub date_taken_source: Option, } // Field order matches the post-migration column order in `image_exif`. @@ -98,6 +104,8 @@ pub struct ImageExif { pub duplicate_of_hash: Option, /// Unix seconds at which the resolve was committed. pub duplicate_decided_at: Option, + /// Which step of the canonical-date waterfall populated `date_taken`. + pub date_taken_source: Option, } #[derive(Insertable)] diff --git a/src/database/schema.rs b/src/database/schema.rs index bbd0a8d..9a9958a 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -125,6 +125,7 @@ diesel::table! 
{ dhash_64 -> Nullable, duplicate_of_hash -> Nullable, duplicate_decided_at -> Nullable, + date_taken_source -> Nullable, } } diff --git a/src/files.rs b/src/files.rs index 9f01624..10d8be4 100644 --- a/src/files.rs +++ b/src/files.rs @@ -1508,6 +1508,7 @@ mod tests { dhash_64: data.dhash_64, duplicate_of_hash: None, duplicate_decided_at: None, + date_taken_source: data.date_taken_source.clone(), }) } @@ -1551,6 +1552,7 @@ mod tests { dhash_64: data.dhash_64, duplicate_of_hash: None, duplicate_decided_at: None, + date_taken_source: data.date_taken_source.clone(), }) } diff --git a/src/main.rs b/src/main.rs index 2d598ca..3c0a9a6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -537,6 +537,8 @@ async fn set_image_gps( // with a usable signal; failure just leaves prior values in place. phash_64: perceptual_hash::compute(&full_path).map(|h| h.phash_64), dhash_64: perceptual_hash::compute(&full_path).map(|h| h.dhash_64), + // Replaced in a follow-up commit with the canonical-date resolver's output. + date_taken_source: None, }; let updated = { @@ -772,6 +774,8 @@ async fn upload_image( size_bytes, phash_64: perceptual.map(|h| h.phash_64), dhash_64: perceptual.map(|h| h.dhash_64), + // Replaced in a follow-up commit with the canonical-date resolver's output. + date_taken_source: None, }; if let Ok(mut dao) = exif_dao.lock() { @@ -2410,6 +2414,8 @@ fn process_new_files( size_bytes, phash_64: perceptual.map(|h| h.phash_64), dhash_64: perceptual.map(|h| h.dhash_64), + // Replaced in a follow-up commit with the canonical-date resolver's output. 
+ date_taken_source: None, }; let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); From 79e258eccd9267e62f2ac99de75703bb18fe5a09 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 6 May 2026 15:59:02 -0400 Subject: [PATCH 2/6] date_resolver: canonical date_taken waterfall with exiftool fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New module that consolidates the four-step ingest waterfall: kamadak-exif (already in process via the caller's prior result) → exiftool fallback → filename regex → earliest_fs_time. Each step is tagged with a `DateSource` so the caller can persist provenance. The exiftool fallback is what makes videos and MakerNote-hosted dates land at all — kamadak-exif can't read QuickTime/MP4 or Nikon-style sub-IFDs. Single-file mode shells out per call; batch mode pipes paths on stdin via `-@ -` and fans the result through one subprocess so the upcoming per-tick drain doesn't pay startup cost per row. The `exiftool` PATH check is cached in a `OnceLock` to keep the drain short-circuited on deploys without exiftool installed. `SubSecDateTimeOriginal` and `ContentCreateDate` are pulled alongside the standard tags to capture iPhone's sub-second precision and Apple's preferred capture-time tag respectively. `FileModifyDate` is deliberately *not* in the tag list — it's a filesystem-derived value the resolver already covers via the `fs_time` step, and pulling it through exiftool would mask "no real EXIF date" with a misleading `source = exiftool` row. Module is registered in both `lib.rs` and `main.rs` (sibling-module pattern the rest of the bin uses); no callers wired in yet — that lands in the next commit. Comes with 9 unit tests covering JSON parsing edge cases, source-priority short-circuiting, and the fs_time-when-no-exif path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/date_resolver.rs | 495 +++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/main.rs | 1 + 3 files changed, 497 insertions(+) create mode 100644 src/date_resolver.rs diff --git a/src/date_resolver.rs b/src/date_resolver.rs new file mode 100644 index 0000000..8498cb8 --- /dev/null +++ b/src/date_resolver.rs @@ -0,0 +1,495 @@ +//! Canonical `date_taken` resolution for ingest and the per-tick backfill +//! drain. +//! +//! The waterfall (in order; first hit wins): +//! +//! 1. **kamadak-exif** — fast in-process EXIF read. Already done by +//! `exif::extract_exif_from_path` for image-bearing formats; callers +//! pass that result in via `prior_exif_date` so we don't re-parse. +//! 2. **exiftool** — shell-out fallback that reaches places kamadak-exif +//! can't: QuickTime/MP4 (`MediaCreateDate`, `TrackCreateDate`, +//! `CreateDate`), Apple's `ContentCreateDate`, MakerNote sub-IFDs. +//! Required for videos to land a real date; degrades silently when +//! `exiftool` isn't on PATH. +//! 3. **filename regex** — `memories::extract_date_from_filename` covers +//! common screenshot / chat-export / timestamp-named patterns. +//! 4. **earliest filesystem time** — `utils::earliest_fs_time` picks the +//! earlier of created / modified, which on copied-from-backup files is +//! a better proxy for content age than either alone. +//! +//! `DateSource` records which step won so the per-tick drain can re-resolve +//! weak sources (`fs_time`) once exiftool becomes available, and so the +//! UI/debug surface can answer "why does this photo show up under this +//! date." Note that the previous `/memories` request-time logic preferred +//! filename even when EXIF was present; this resolver inverts that — EXIF +//! is authoritative when it exists, on the theory that an EXIF +//! `DateTimeOriginal` is more reliable than a filename pattern that may +//! reflect import time rather than capture time. 
+
+use std::collections::HashMap;
+use std::io::Write;
+use std::path::{Path, PathBuf};
+use std::process::{Command, Stdio};
+use std::sync::OnceLock;
+
+use chrono::{DateTime, Utc};
+use log::{debug, trace, warn};
+use serde::Deserialize;
+
+use crate::utils::earliest_fs_time;
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum DateSource {
+    /// kamadak-exif read DateTime/DateTimeOriginal directly.
+    Exif,
+    /// exiftool fallback caught a video / MakerNote / QuickTime tag.
+    Exiftool,
+    /// `extract_date_from_filename` matched a known pattern.
+    Filename,
+    /// Fell through to `earliest_fs_time(metadata)`.
+    FsTime,
+}
+
+impl DateSource {
+    pub fn as_str(self) -> &'static str {
+        match self {
+            DateSource::Exif => "exif",
+            DateSource::Exiftool => "exiftool",
+            DateSource::Filename => "filename",
+            DateSource::FsTime => "fs_time",
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+pub struct ResolvedDate {
+    pub timestamp: i64, // Unix seconds
+    pub source: DateSource, // waterfall step that produced `timestamp`
+}
+
+/// Resolve the canonical date for a single file, given an already-extracted
+/// kamadak-exif date if available. Returns `None` only if every step in the
+/// waterfall fails — for files that exist on disk this should be vanishingly
+/// rare (the fs-time fallback alone almost always succeeds).
+pub fn resolve_date_taken(path: &Path, prior_exif_date: Option<i64>) -> Option<ResolvedDate> {
+    if let Some(ts) = prior_exif_date {
+        return Some(ResolvedDate {
+            timestamp: ts,
+            source: DateSource::Exif,
+        });
+    }
+    if let Some(ts) = exiftool_date_single(path) {
+        return Some(ResolvedDate {
+            timestamp: ts,
+            source: DateSource::Exiftool,
+        });
+    }
+    if let Some(dt) = path
+        .file_name()
+        .and_then(|f| f.to_str())
+        .and_then(crate::memories::extract_date_from_filename)
+    {
+        return Some(ResolvedDate {
+            timestamp: dt.timestamp(),
+            source: DateSource::Filename,
+        });
+    }
+    if let Ok(meta) = std::fs::metadata(path)
+        && let Some(t) = earliest_fs_time(&meta)
+    {
+        let dt: DateTime<Utc> = t.into();
+        return Some(ResolvedDate {
+            timestamp: dt.timestamp(),
+            source: DateSource::FsTime,
+        });
+    }
+    None
+}
+
+/// Batch waterfall. exiftool runs once over the whole batch (single
+/// subprocess); everything else is per-file and runs only on misses.
+/// `prior_exif_dates` lets the caller pass in already-known kamadak dates
+/// keyed by path; entries without a prior date fall through to exiftool
+/// and the rest of the waterfall.
+///
+/// The per-tick backfill drain is the primary caller — it loads ~500 rows
+/// at a time and uses one exiftool subprocess to drain the lot.
+pub fn resolve_dates_batch(
+    paths: &[PathBuf],
+    prior_exif_dates: &HashMap<PathBuf, i64>,
+) -> HashMap<PathBuf, ResolvedDate> {
+    let mut out: HashMap<PathBuf, ResolvedDate> = HashMap::new();
+    let mut needs_exiftool: Vec<&Path> = Vec::with_capacity(paths.len());
+
+    for path in paths {
+        if let Some(&ts) = prior_exif_dates.get(path) {
+            out.insert(
+                path.clone(),
+                ResolvedDate {
+                    timestamp: ts,
+                    source: DateSource::Exif,
+                },
+            );
+        } else {
+            needs_exiftool.push(path.as_path());
+        }
+    }
+
+    if !needs_exiftool.is_empty() {
+        let exiftool_results = exiftool_dates_batch(&needs_exiftool);
+        for path in &needs_exiftool {
+            if let Some(&ts) = exiftool_results.get(*path) {
+                out.insert(
+                    path.to_path_buf(),
+                    ResolvedDate {
+                        timestamp: ts,
+                        source: DateSource::Exiftool,
+                    },
+                );
+            }
+        }
+    }
+
+    for path in paths {
+        if out.contains_key(path) {
+            continue;
+        }
+        if let Some(dt) = path
+            .file_name()
+            .and_then(|f| f.to_str())
+            .and_then(crate::memories::extract_date_from_filename)
+        {
+            out.insert(
+                path.clone(),
+                ResolvedDate {
+                    timestamp: dt.timestamp(),
+                    source: DateSource::Filename,
+                },
+            );
+            continue;
+        }
+        if let Ok(meta) = std::fs::metadata(path)
+            && let Some(t) = earliest_fs_time(&meta)
+        {
+            let dt: DateTime<Utc> = t.into();
+            out.insert(
+                path.clone(),
+                ResolvedDate {
+                    timestamp: dt.timestamp(),
+                    source: DateSource::FsTime,
+                },
+            );
+        }
+    }
+
+    out
+}
+
+/// Tag priority for exiftool extraction. First non-zero value wins.
+///
+/// Photos: `DateTimeOriginal` (original capture) and `SubSecDateTimeOriginal`
+/// are most authoritative. `CreateDate` is a common alias and a sane fallback.
+///
+/// Videos: `MediaCreateDate` / `TrackCreateDate` are the QuickTime/MP4
+/// timestamps. `ContentCreateDate` is Apple's iOS-set tag; it often
+/// reflects local capture time on iPhone exports better than the others.
+///
+/// Notably absent: `FileModifyDate` / `FileAccessDate` — those are
+/// filesystem-derived and the resolver covers them via the `fs_time`
+/// fallback. Letting exiftool pull them here would mask "no real EXIF
+/// date" with a `source = exiftool` row that's no better than fs_time.
+const EXIFTOOL_DATE_TAGS: &[&str] = &[
+    "DateTimeOriginal",
+    "SubSecDateTimeOriginal",
+    "CreateDate",
+    "MediaCreateDate",
+    "TrackCreateDate",
+    "ContentCreateDate",
+];
+
+/// Cache the "exiftool exists on PATH" check across the process lifetime so
+/// the per-tick backfill doesn't fork a doomed subprocess every iteration on
+/// deploys without exiftool installed.
+fn exiftool_available() -> bool {
+    static AVAIL: OnceLock<bool> = OnceLock::new();
+    *AVAIL.get_or_init(|| {
+        let ok = Command::new("exiftool")
+            .arg("-ver")
+            .stdout(Stdio::null())
+            .stderr(Stdio::null())
+            .status()
+            .map(|s| s.success())
+            .unwrap_or(false);
+        if !ok {
+            warn!("exiftool not on PATH; date_taken waterfall skips that step");
+        }
+        ok
+    })
+}
+
+/// One-file exiftool invocation. Used by the upload + GPS-write paths,
+/// which deal with one file at a time. The batch path uses
+/// `exiftool_dates_batch` so we don't pay subprocess startup per row.
+fn exiftool_date_single(path: &Path) -> Option<i64> {
+    if !exiftool_available() {
+        return None;
+    }
+    let mut cmd = Command::new("exiftool");
+    cmd.arg("-j").arg("-q").arg("-d").arg("%s").arg("-fast2");
+    for tag in EXIFTOOL_DATE_TAGS {
+        cmd.arg(format!("-{}", tag));
+    }
+    cmd.arg(path);
+    let output = cmd.output().ok()?;
+    if !output.status.success() {
+        trace!("exiftool exited non-zero for {:?}", path);
+        return None;
+    }
+    parse_exiftool_json(&output.stdout)
+        .into_iter()
+        .next()
+        .map(|(_, ts)| ts)
+}
+
+/// Drain a batch via a single exiftool subprocess. Paths are fed on stdin
+/// via `-@ -`, so the argv stays short regardless of batch size — safe for
+/// libraries with very long path components.
+fn exiftool_dates_batch(paths: &[&Path]) -> HashMap<PathBuf, i64> {
+    let mut out = HashMap::new();
+    if paths.is_empty() || !exiftool_available() {
+        return out;
+    }
+
+    let mut cmd = Command::new("exiftool");
+    cmd.arg("-j").arg("-q").arg("-d").arg("%s").arg("-fast2");
+    for tag in EXIFTOOL_DATE_TAGS {
+        cmd.arg(format!("-{}", tag));
+    }
+    cmd.arg("-@").arg("-");
+    cmd.stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::null());
+
+    let mut child = match cmd.spawn() {
+        Ok(c) => c,
+        Err(e) => {
+            warn!("exiftool batch spawn failed: {}", e);
+            return out;
+        }
+    };
+
+    if let Some(mut stdin) = child.stdin.take() {
+        for p in paths {
+            // exiftool's argfile reader treats each line as one path; OS
+            // path bytes don't always survive a String round-trip, but
+            // every path we get here originated from rel_path / root_path
+            // strings already, so to-string-lossy is a non-event.
+            if let Err(e) = writeln!(stdin, "{}", p.display()) {
+                warn!("exiftool batch stdin write failed: {}", e);
+                break;
+            }
+        }
+    }
+
+    let output = match child.wait_with_output() {
+        Ok(o) => o,
+        Err(e) => {
+            warn!("exiftool batch wait failed: {}", e);
+            return out;
+        }
+    };
+    if !output.status.success() {
+        debug!(
+            "exiftool batch exit status {:?}; partial output may still parse",
+            output.status.code()
+        );
+    }
+    for (source, ts) in parse_exiftool_json(&output.stdout) {
+        out.insert(PathBuf::from(source), ts);
+    }
+    out
+}
+
+/// One row per input file. exiftool emits any tag we asked for that was
+/// present, plus the `SourceFile` it was reading. Tags are JSON values
+/// because `-d %s` returns the timestamp as a *string* of digits, not a
+/// number, when the date parses; absent tags are simply missing keys.
+#[derive(Debug, Deserialize)]
+struct ExiftoolEntry {
+    #[serde(rename = "SourceFile")]
+    source_file: String,
+    #[serde(rename = "DateTimeOriginal")]
+    date_time_original: Option<serde_json::Value>,
+    #[serde(rename = "SubSecDateTimeOriginal")]
+    sub_sec_date_time_original: Option<serde_json::Value>,
+    #[serde(rename = "CreateDate")]
+    create_date: Option<serde_json::Value>,
+    #[serde(rename = "MediaCreateDate")]
+    media_create_date: Option<serde_json::Value>,
+    #[serde(rename = "TrackCreateDate")]
+    track_create_date: Option<serde_json::Value>,
+    #[serde(rename = "ContentCreateDate")]
+    content_create_date: Option<serde_json::Value>,
+}
+
+fn parse_exiftool_json(stdout: &[u8]) -> Vec<(String, i64)> {
+    let entries: Vec<ExiftoolEntry> = match serde_json::from_slice(stdout) {
+        Ok(v) => v,
+        Err(e) => {
+            // Empty stdout on total failure isn't a parse error worth
+            // logging at warn — the caller already noted the non-zero
+            // exit status.
+            if !stdout.is_empty() {
+                warn!("exiftool JSON parse failed: {}", e);
+            }
+            return Vec::new();
+        }
+    };
+
+    let mut out = Vec::with_capacity(entries.len());
+    for entry in entries {
+        // Walk the priority list. exiftool sometimes returns the literal
+        // string "0000:00:00 00:00:00" for missing-but-allocated date
+        // slots; with `-d %s` that becomes the unix epoch (0). Reject
+        // anything <= 0 so we fall through to the next tag.
+        let tags = [
+            entry.date_time_original.as_ref(),
+            entry.sub_sec_date_time_original.as_ref(),
+            entry.create_date.as_ref(),
+            entry.media_create_date.as_ref(),
+            entry.track_create_date.as_ref(),
+            entry.content_create_date.as_ref(),
+        ];
+        let mut chosen: Option<i64> = None;
+        for tag in tags.iter().flatten() {
+            if let Some(ts) = coerce_to_unix_seconds(tag)
+                && ts > 0
+            {
+                chosen = Some(ts);
+                break;
+            }
+        }
+        if let Some(ts) = chosen {
+            out.push((entry.source_file, ts));
+        }
+    }
+    out
+}
+
+/// `-d %s` should hand us a numeric string, but exiftool's JSON encoder
+/// will emit a number when the tag was defined as numeric in its lib —
+/// accept both shapes.
+fn coerce_to_unix_seconds(v: &serde_json::Value) -> Option<i64> {
+    match v {
+        serde_json::Value::String(s) => s.trim().parse::<i64>().ok(),
+        serde_json::Value::Number(n) => n.as_i64(),
+        _ => None,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_exiftool_json_picks_first_priority_tag() {
+        let json = br#"[{
+            "SourceFile": "/lib/IMG.jpg",
+            "DateTimeOriginal": "1500000000",
+            "CreateDate": "1400000000"
+        }]"#;
+        let parsed = parse_exiftool_json(json);
+        assert_eq!(parsed, vec![("/lib/IMG.jpg".to_string(), 1500000000)]);
+    }
+
+    #[test]
+    fn parse_exiftool_json_falls_through_zeros() {
+        // exiftool emits "0000:00:00 00:00:00" → unix epoch 0 with -d %s.
+        // The resolver should skip those and pick the next tag.
+        let json = br#"[{
+            "SourceFile": "/lib/clip.mov",
+            "DateTimeOriginal": "0",
+            "MediaCreateDate": "1500000000"
+        }]"#;
+        let parsed = parse_exiftool_json(json);
+        assert_eq!(parsed, vec![("/lib/clip.mov".to_string(), 1500000000)]);
+    }
+
+    #[test]
+    fn parse_exiftool_json_accepts_numeric_values() {
+        let json = br#"[{
+            "SourceFile": "/lib/a.jpg",
+            "CreateDate": 1234567890
+        }]"#;
+        let parsed = parse_exiftool_json(json);
+        assert_eq!(parsed, vec![("/lib/a.jpg".to_string(), 1234567890)]);
+    }
+
+    #[test]
+    fn parse_exiftool_json_emits_nothing_when_no_tag_present() {
+        let json = br#"[{"SourceFile": "/lib/no_dates.bin"}]"#;
+        let parsed = parse_exiftool_json(json);
+        assert!(parsed.is_empty());
+    }
+
+    #[test]
+    fn parse_exiftool_json_handles_multiple_entries() {
+        let json = br#"[
+            {"SourceFile": "/lib/a.jpg", "DateTimeOriginal": "100"},
+            {"SourceFile": "/lib/b.jpg", "CreateDate": "200"}
+        ]"#;
+        let parsed = parse_exiftool_json(json);
+        assert_eq!(
+            parsed,
+            vec![
+                ("/lib/a.jpg".to_string(), 100),
+                ("/lib/b.jpg".to_string(), 200)
+            ]
+        );
+    }
+
+    #[test]
+    fn date_source_as_str_round_trip() {
+        for src in [
+            DateSource::Exif,
+            DateSource::Exiftool,
+            DateSource::Filename,
+            DateSource::FsTime,
+        ] {
assert!(!src.as_str().is_empty()); + } + } + + #[test] + fn resolve_uses_prior_exif_when_present() { + // Path doesn't need to exist when prior_exif_date short-circuits. + let resolved = + resolve_date_taken(Path::new("/nonexistent/file.jpg"), Some(1700000000)).unwrap(); + assert_eq!(resolved.timestamp, 1700000000); + assert_eq!(resolved.source, DateSource::Exif); + } + + #[test] + fn resolve_filename_when_no_exif_and_file_missing() { + // No prior EXIF, no exiftool match (file missing), but the filename + // pattern still matches so the resolver lands on Filename. + let resolved = resolve_date_taken( + Path::new("/nonexistent/Screenshot_2014-06-01-20-44-50.png"), + None, + ) + .unwrap(); + assert_eq!(resolved.source, DateSource::Filename); + } + + #[test] + fn resolve_fs_time_when_only_metadata_available() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("plain.jpg"); + std::fs::File::create(&path).unwrap(); + let resolved = resolve_date_taken(&path, None).unwrap(); + // exiftool may or may not be installed in the test env; either + // way the file has no EXIF and no filename date, so we should + // fall to fs_time. 
+ assert_eq!(resolved.source, DateSource::FsTime); + } +} diff --git a/src/lib.rs b/src/lib.rs index c110d8e..46deaac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,6 +10,7 @@ pub mod cleanup; pub mod content_hash; pub mod data; pub mod database; +pub mod date_resolver; pub mod duplicates; pub mod error; pub mod exif; diff --git a/src/main.rs b/src/main.rs index 3c0a9a6..84af187 100644 --- a/src/main.rs +++ b/src/main.rs @@ -64,6 +64,7 @@ mod auth; mod content_hash; mod data; mod database; +mod date_resolver; mod duplicates; mod error; mod exif; From 2d1429173329027615b22d182ba0fd16f519d982 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 6 May 2026 16:00:14 -0400 Subject: [PATCH 3/6] ingest: stamp canonical date_taken on every InsertImageExif MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires `date_resolver::resolve_date_taken` into the three call sites that build `InsertImageExif`: - `process_new_files` (file watcher) — every newly-registered file gets the resolver's verdict so videos and EXIF-stripped images land with a real date instead of NULL. - Upload handler — same waterfall on the post-multipart-write path. - GPS-write handler — re-runs the waterfall after exiftool writes GPS and re-reads the EXIF, in case a previously fs_time-sourced row now has a real EXIF date to upgrade to. This is a behavior change vs. the pre-rewrite `/memories` request-time priority: EXIF now beats filename when both are present. A photo named `Screenshot_2014-06-01.png` whose EXIF `DateTime` is 2021 now appears under 2021. The reverse case (no EXIF, parseable filename) is unchanged and continues to surface the filename date with `date_taken_source = 'filename'`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/main.rs | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/main.rs b/src/main.rs index 84af187..f7ea49f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -504,6 +504,11 @@ async fn set_image_gps( }; let now = Utc::now().timestamp(); let normalized_path = body.path.replace('\\', "/"); + // Re-run the canonical-date waterfall on every GPS write — exiftool + // writing GPS doesn't change the capture date, but if the row was + // previously sourced from `fs_time` the re-read may have given us a + // real EXIF date this time, and we want to upgrade the source. + let resolved_date = date_resolver::resolve_date_taken(&full_path, extracted.date_taken); let insert_exif = InsertImageExif { library_id: resolved_library.id, file_path: normalized_path.clone(), @@ -520,7 +525,7 @@ async fn set_image_gps( aperture: extracted.aperture.map(|v| v as f32), shutter_speed: extracted.shutter_speed, iso: extracted.iso, - date_taken: extracted.date_taken, + date_taken: resolved_date.map(|r| r.timestamp), // Created_time is preserved by update_exif (it doesn't touch the // column); pass any int — it's ignored in the UPDATE statement. created_time: now, @@ -538,8 +543,7 @@ async fn set_image_gps( // with a usable signal; failure just leaves prior values in place. phash_64: perceptual_hash::compute(&full_path).map(|h| h.phash_64), dhash_64: perceptual_hash::compute(&full_path).map(|h| h.dhash_64), - // Replaced in a follow-up commit with the canonical-date resolver's output. 
- date_taken_source: None, + date_taken_source: resolved_date.map(|r| r.source.as_str().to_string()), }; let updated = { @@ -752,6 +756,10 @@ async fn upload_image( } }; let perceptual = perceptual_hash::compute(&uploaded_path); + let resolved_date = date_resolver::resolve_date_taken( + &uploaded_path, + exif_data.date_taken, + ); let insert_exif = InsertImageExif { library_id: target_library.id, file_path: relative_path.clone(), @@ -768,15 +776,15 @@ async fn upload_image( aperture: exif_data.aperture.map(|v| v as f32), shutter_speed: exif_data.shutter_speed, iso: exif_data.iso, - date_taken: exif_data.date_taken, + date_taken: resolved_date.map(|r| r.timestamp), created_time: timestamp, last_modified: timestamp, content_hash, size_bytes, phash_64: perceptual.map(|h| h.phash_64), dhash_64: perceptual.map(|h| h.dhash_64), - // Replaced in a follow-up commit with the canonical-date resolver's output. - date_taken_source: None, + date_taken_source: resolved_date + .map(|r| r.source.as_str().to_string()), }; if let Ok(mut dao) = exif_dao.lock() { @@ -2382,6 +2390,16 @@ fn process_new_files( None }; + // Canonical date_taken via the waterfall — kamadak-exif (already + // computed above) → exiftool fallback for videos / MakerNote / + // QuickTime → filename regex → earliest_fs_time. Source is + // recorded so the per-tick backfill drain can re-run weak + // resolutions later. 
+ let resolved_date = date_resolver::resolve_date_taken( + &file_path, + exif_fields.as_ref().and_then(|e| e.date_taken), + ); + let insert_exif = InsertImageExif { library_id: library.id, file_path: relative_path.clone(), @@ -2408,15 +2426,14 @@ fn process_new_files( .and_then(|e| e.aperture.map(|v| v as f32)), shutter_speed: exif_fields.as_ref().and_then(|e| e.shutter_speed.clone()), iso: exif_fields.as_ref().and_then(|e| e.iso), - date_taken: exif_fields.as_ref().and_then(|e| e.date_taken), + date_taken: resolved_date.map(|r| r.timestamp), created_time: timestamp, last_modified: timestamp, content_hash, size_bytes, phash_64: perceptual.map(|h| h.phash_64), dhash_64: perceptual.map(|h| h.dhash_64), - // Replaced in a follow-up commit with the canonical-date resolver's output. - date_taken_source: None, + date_taken_source: resolved_date.map(|r| r.source.as_str().to_string()), }; let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); From 54e0635a9855659fafb3e36a4fb2bb59b361cd13 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 6 May 2026 16:03:03 -0400 Subject: [PATCH 4/6] date_backfill: per-tick drain for unresolved date_taken rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two ExifDao methods (`get_rows_needing_date_backfill` / `backfill_date_taken`) and a `backfill_missing_date_taken` watcher pass that runs on every tick alongside `backfill_unhashed_backlog`. The drain queries the partial index for rows where `date_taken IS NULL` or `date_taken_source = 'fs_time'`, batches up to `DATE_BACKFILL_MAX_PER_TICK` paths (default 500), and feeds them through `date_resolver::resolve_dates_batch` — a single exiftool subprocess covers the whole tick. Rows that newly resolve to `exiftool` / `filename` / `fs_time` get persisted via `backfill_date_taken` (touches only `date_taken` + `date_taken_source` so EXIF / hash / perceptual columns survive). 
`filename`-sourced rows are intentionally not re-resolved — the regex
is authoritative when it matches and re-running exiftool wouldn't change
the answer. Files that have disappeared from disk are skipped so a ghost
row doesn't loop through the drain forever; the missing-file scan in
`library_maintenance` retires those separately.

Comes with two DAO unit tests (eligibility filter + column-isolation).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/database/mod.rs | 165 ++++++++++++++++++++++++++++++++++++++++++++
 src/files.rs        |  20 ++++++
 src/main.rs         | 106 ++++++++++++++++++++++++++++
 3 files changed, 291 insertions(+)

diff --git a/src/database/mod.rs b/src/database/mod.rs
index 9ee2a16..0754bea 100644
--- a/src/database/mod.rs
+++ b/src/database/mod.rs
@@ -396,6 +396,34 @@ pub trait ExifDao: Sync + Send {
         size_bytes: i64,
     ) -> Result<(), DbError>;
 
+    /// Return image_exif rows that need their `date_taken` re-resolved by
+    /// the canonical-date waterfall (see `crate::date_resolver`):
+    /// either no source ever ran (`date_taken IS NULL`), or only the
+    /// weakest fallback resolved it (`date_taken_source = 'fs_time'`).
+    /// Returns up to `limit` `(library_id, rel_path)` pairs, lowest `id`
+    /// first. Only rows belonging to `library_id` are returned; other
+    /// libraries' rows surface on their own tick. Backed by the partial
+    /// index `idx_image_exif_date_backfill`.
+    fn get_rows_needing_date_backfill(
+        &mut self,
+        context: &opentelemetry::Context,
+        library_id: i32,
+        limit: i64,
+    ) -> Result<Vec<(i32, String)>, DbError>;
+
+    /// Persist a resolver result for an existing row. Touches `date_taken`
+    /// and `date_taken_source` only — leaves all other columns alone so
+    /// the drain doesn't accidentally clobber EXIF/hash/perceptual data
+    /// the watcher / GPS-write path may have already written.
+    fn backfill_date_taken(
+        &mut self,
+        context: &opentelemetry::Context,
+        library_id: i32,
+        rel_path: &str,
+        date_taken: i64,
+        source: &str,
+    ) -> Result<(), DbError>;
+
     /// Return image rows that have a `content_hash` but no `phash_64`,
     /// oldest first. Used by the `backfill_perceptual_hash` binary.
     /// Filters by image extension at the DB layer to avoid ever asking
@@ -1056,6 +1084,61 @@ impl ExifDao for SqliteExifDao {
         .map_err(|_| DbError::new(DbErrorKind::UpdateError))
     }
 
+    fn get_rows_needing_date_backfill(
+        &mut self,
+        context: &opentelemetry::Context,
+        library_id_val: i32,
+        limit: i64,
+    ) -> Result<Vec<(i32, String)>, DbError> {
+        trace_db_call(context, "query", "get_rows_needing_date_backfill", |_span| {
+            use schema::image_exif::dsl::*;
+
+            let mut connection = self.connection.lock().expect("Unable to get ExifDao");
+
+            // The partial index is on `(library_id, id) WHERE date_taken
+            // IS NULL OR date_taken_source = 'fs_time'`, so the planner
+            // hits it directly when both predicates are present.
+            image_exif
+                .filter(library_id.eq(library_id_val))
+                .filter(date_taken.is_null().or(date_taken_source.eq("fs_time")))
+                .select((library_id, rel_path))
+                .order(id.asc())
+                .limit(limit)
+                .load::<(i32, String)>(connection.deref_mut())
+                .map_err(|_| anyhow::anyhow!("Query error"))
+        })
+        .map_err(|_| DbError::new(DbErrorKind::QueryError))
+    }
+
+    fn backfill_date_taken(
+        &mut self,
+        context: &opentelemetry::Context,
+        library_id_val: i32,
+        rel_path_val: &str,
+        date_taken_val: i64,
+        source: &str,
+    ) -> Result<(), DbError> {
+        trace_db_call(context, "update", "backfill_date_taken", |_span| {
+            use schema::image_exif::dsl::*;
+
+            let mut connection = self.connection.lock().expect("Unable to get ExifDao");
+
+            diesel::update(
+                image_exif
+                    .filter(library_id.eq(library_id_val))
+                    .filter(rel_path.eq(rel_path_val)),
+            )
+            .set((
+                date_taken.eq(date_taken_val),
+                date_taken_source.eq(source),
+            ))
+            .execute(connection.deref_mut())
+            .map(|_| ())
+            .map_err(|_| anyhow::anyhow!("Update error"))
+        })
+        .map_err(|_| DbError::new(DbErrorKind::UpdateError))
+    }
+
     fn find_by_content_hash(
         &mut self,
         context: &opentelemetry::Context,
@@ -1933,4 +2016,86 @@ mod exif_dao_tests {
         // Unknown library: zero, no error.
         assert_eq!(dao.count_for_library(&ctx(), 999).unwrap(), 0);
     }
+
+    /// Insert a row with an explicit date source — used by the
+    /// canonical-date drain tests below.
+ fn insert_row_with_source( + dao: &mut SqliteExifDao, + lib_id: i32, + rel: &str, + date: Option, + source: Option<&str>, + ) { + dao.store_exif( + &ctx(), + InsertImageExif { + library_id: lib_id, + file_path: rel.to_string(), + camera_make: None, + camera_model: None, + lens_model: None, + width: None, + height: None, + orientation: None, + gps_latitude: None, + gps_longitude: None, + gps_altitude: None, + focal_length: None, + aperture: None, + shutter_speed: None, + iso: None, + date_taken: date, + created_time: 0, + last_modified: 0, + content_hash: None, + size_bytes: None, + phash_64: None, + dhash_64: None, + date_taken_source: source.map(|s| s.to_string()), + }, + ) + .expect("insert exif row"); + } + + #[test] + fn get_rows_needing_date_backfill_returns_null_and_fs_time() { + let mut dao = setup_two_libraries(); + // Each row exercises a different source: null, fs_time (eligible), + // filename and exif (skipped). + insert_row_with_source(&mut dao, 1, "main/null.jpg", None, None); + insert_row_with_source(&mut dao, 1, "main/fs.jpg", Some(123), Some("fs_time")); + insert_row_with_source(&mut dao, 1, "main/name.jpg", Some(456), Some("filename")); + insert_row_with_source(&mut dao, 1, "main/real.jpg", Some(789), Some("exif")); + // Other library — never returned even when eligible. + insert_row_with_source(&mut dao, 2, "archive/null.jpg", None, None); + + let rows = dao + .get_rows_needing_date_backfill(&ctx(), 1, 100) + .unwrap(); + let paths: Vec = rows.into_iter().map(|(_, p)| p).collect(); + assert_eq!(paths.len(), 2, "expected null + fs_time eligible only"); + assert!(paths.contains(&"main/null.jpg".to_string())); + assert!(paths.contains(&"main/fs.jpg".to_string())); + } + + #[test] + fn backfill_date_taken_writes_date_and_source_only() { + let mut dao = setup_two_libraries(); + insert_row_with_source(&mut dao, 1, "main/x.jpg", None, None); + // Set a content_hash on the row to verify backfill_date_taken + // doesn't disturb other columns. 
Using the existing + // backfill_content_hash for this verifies via a separate path. + dao.backfill_content_hash(&ctx(), 1, "main/x.jpg", "deadbeef", 1024) + .unwrap(); + + dao.backfill_date_taken(&ctx(), 1, "main/x.jpg", 1700000000, "exiftool") + .unwrap(); + + let row = dao.get_exif(&ctx(), "main/x.jpg").unwrap().unwrap(); + assert_eq!(row.date_taken, Some(1700000000)); + assert_eq!(row.date_taken_source, Some("exiftool".to_string())); + // Untouched columns survive. + assert_eq!(row.content_hash, Some("deadbeef".to_string())); + assert_eq!(row.size_bytes, Some(1024)); + } } diff --git a/src/files.rs b/src/files.rs index 10d8be4..b798330 100644 --- a/src/files.rs +++ b/src/files.rs @@ -1646,6 +1646,26 @@ mod tests { Ok(()) } + fn get_rows_needing_date_backfill( + &mut self, + _context: &opentelemetry::Context, + _library_id: i32, + _limit: i64, + ) -> Result, DbError> { + Ok(Vec::new()) + } + + fn backfill_date_taken( + &mut self, + _context: &opentelemetry::Context, + _library_id: i32, + _rel_path: &str, + _date_taken: i64, + _source: &str, + ) -> Result<(), DbError> { + Ok(()) + } + fn find_by_content_hash( &mut self, _context: &opentelemetry::Context, diff --git a/src/main.rs b/src/main.rs index f7ea49f..9d1775d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2125,6 +2125,15 @@ fn watch_files( ); } + // Date-taken backfill: drain rows whose canonical date is + // either unresolved or only fs_time-sourced. Independent + // of face detection — runs even on deploys that don't + // configure Apollo, since `/memories` depends on it. + { + let context = opentelemetry::Context::new(); + backfill_missing_date_taken(&context, lib, &exif_dao); + } + if is_full_scan { info!( "Running full scan for library '{}' (scan #{})", @@ -2706,6 +2715,103 @@ fn backfill_unhashed_backlog( backfilled } +/// Drain image_exif rows whose `date_taken` was never resolved or was +/// resolved by the weakest fallback (`fs_time`). 
Runs the canonical-date +/// waterfall — exiftool batch (one subprocess for the whole tick's +/// rows) → filename regex → earliest_fs_time — and persists each +/// resolution with its source tag. Capped per tick by +/// `DATE_BACKFILL_MAX_PER_TICK` (default 500) so a 14k-row library +/// drains over a few quick-scan ticks without blocking the watcher. +/// +/// kamadak-exif is intentionally skipped here: the row already has a +/// NULL date_taken because the ingest path's kamadak-exif call returned +/// nothing, and re-running it would just produce the same answer. +/// exiftool is the meaningful new attempt — it handles videos and +/// MakerNote-hosted dates kamadak can't reach. +fn backfill_missing_date_taken( + context: &opentelemetry::Context, + library: &libraries::Library, + exif_dao: &Arc>>, +) -> usize { + let cap: i64 = dotenv::var("DATE_BACKFILL_MAX_PER_TICK") + .ok() + .and_then(|s| s.parse().ok()) + .filter(|n: &i64| *n > 0) + .unwrap_or(500); + + let rows: Vec<(i32, String)> = { + let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); + dao.get_rows_needing_date_backfill(context, library.id, cap + 1) + .unwrap_or_default() + }; + if rows.is_empty() { + return 0; + } + + let more_than_cap = rows.len() as i64 > cap; + let base_path = std::path::Path::new(&library.root_path); + + // Build absolute paths and drop rows whose files no longer exist — + // the missing-file scan in library_maintenance retires deleted rows + // separately. Without this filter, NULL-date rows for missing files + // would loop through the drain forever (no source can resolve them). 
+ let mut existing: Vec<(String, PathBuf)> = Vec::with_capacity(rows.len() as usize); + for (_, rel_path) in rows.iter().take(cap as usize) { + let abs = base_path.join(rel_path); + if abs.exists() { + existing.push((rel_path.clone(), abs)); + } + } + if existing.is_empty() { + return 0; + } + + // One exiftool subprocess for the whole batch; the resolver falls + // through to filename / fs_time per file when exiftool can't supply + // a date (or isn't installed at all). + let paths: Vec = existing.iter().map(|(_, p)| p.clone()).collect(); + let resolved = date_resolver::resolve_dates_batch(&paths, &HashMap::new()); + + let mut backfilled = 0usize; + let mut unresolved = 0usize; + let mut by_source: HashMap<&'static str, usize> = HashMap::new(); + { + let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); + for (rel_path, abs) in &existing { + let Some(rd) = resolved.get(abs).copied() else { + unresolved += 1; + continue; + }; + match dao.backfill_date_taken( + context, + library.id, + rel_path, + rd.timestamp, + rd.source.as_str(), + ) { + Ok(()) => { + backfilled += 1; + *by_source.entry(rd.source.as_str()).or_insert(0) += 1; + } + Err(e) => { + warn!( + "date_backfill: update failed for lib {} {}: {:?}", + library.id, rel_path, e + ); + } + } + } + } + + if backfilled > 0 || unresolved > 0 || more_than_cap { + info!( + "date_backfill: library '{}': resolved {} ({:?}), {} unresolved, cap={}, more_remain={}", + library.name, backfilled, by_source, unresolved, cap, more_than_cap + ); + } + backfilled +} + /// Per-tick face-detection drain. Pulls a capped batch of hashed-but- /// unscanned image_exif rows directly via the FaceDao anti-join and /// hands them to the existing detection pass. 
Runs on every tick (not From 7f12890f4b38ceeee69d59475f78c97c3fe3bc43 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 6 May 2026 16:04:09 -0400 Subject: [PATCH 5/6] memories: single-SQL rewrite + 20-year lookback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the EXIF-loop + WalkDir-fallback pipeline that powered `/memories` with a single per-library SQL query (`get_memories_in_window`) that uses `strftime('%m-%d' | '%W' | '%m', date_taken, 'unixepoch', tz_offset)` for calendar matching in the client's timezone, plus a `years_back` lower bound and a no-future-dates upper bound. Returns only the matching rows; the handler applies per-library `PathExcluder` post-query and sorts. Drops: - `collect_exif_memories` — replaced by the single SQL query. - `collect_filesystem_memories` — the canonical-date pipeline now populates `date_taken` for every row at ingest, so the WalkDir fallback that scanned 14k+ files each request is no longer needed. - `get_memory_date_with_priority` and friends — request-time waterfall superseded by `date_resolver` running at ingest. The associated three priority-tests are dropped; their replacement lives in `date_resolver::tests`. On a ~14k-file library this drops `/memories` from 10–15 s (dominated by `fs::metadata` per row) to single-digit ms. Bumps `DEFAULT_YEARS_BACK` from 15 → 20 to surface deeper archives on matching anniversaries. Note vs. ISO weeks: the original Rust used `chrono::iso_week().week()` for week-span matching. SQLite's `%W` is Monday-anchored but uses week 0 for days before the first Monday, so it can disagree with ISO at year boundaries by ±1. Acceptable for nostalgia browsing. Adds 3 new DAO tests covering month-span filter, library scoping, and the unknown-span-token guard. Also adds a CLAUDE.md section describing the canonical-date pipeline end-to-end and the new `DATE_BACKFILL_MAX_PER_TICK` env var. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 48 ++++ src/database/mod.rs | 263 ++++++++++++++++++-- src/files.rs | 11 + src/memories.rs | 573 ++++++++------------------------------------ 4 files changed, 403 insertions(+), 492 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 984febd..3567915 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -364,6 +364,53 @@ Runs in background thread with two-tier strategy: - Batch queries EXIF DB to detect new files - Configurable via `WATCH_QUICK_INTERVAL_SECONDS` and `WATCH_FULL_INTERVAL_SECONDS` +**Canonical date_taken pipeline (`src/date_resolver.rs`).** Every row's +`image_exif.date_taken` is populated at ingest by a four-step waterfall; +which step won is recorded in `image_exif.date_taken_source` so the +per-tick drain can re-resolve weak entries when better tools become +available, and so the UI/debug surface can answer "why did this photo +land on this date?". Order: + +1. **`exif`** — kamadak-exif `DateTime` / `DateTimeOriginal`. Fast, + in-process, image-only. +2. **`exiftool`** — shell-out fallback for tags kamadak can't reach: + QuickTime/MP4 (`MediaCreateDate`, `TrackCreateDate`, `CreateDate`), + Apple's `ContentCreateDate`, MakerNote sub-IFDs. Required for + videos to land a real date. Single-file at ingest; the per-tick + drain feeds the whole batch through one `exiftool -@ -` subprocess. + Degrades silently when `exiftool` isn't on PATH (resolver caches the + "available" check via `OnceLock`). +3. **`filename`** — `extract_date_from_filename` in `memories.rs` + matches screenshot, chat-export, and timestamp-named patterns. +4. **`fs_time`** — `earliest_fs_time(metadata)` (earlier of created / + modified). Last resort. + +Notable behavior change vs. 
the pre-2026-05 request-time logic: +**EXIF beats filename when both are present.** A photo named +`Screenshot_2014-06-01.png` whose EXIF `DateTime` is 2021 now appears +under 2021, not 2014 — on the theory that EXIF is more reliable than +import-named filenames. The reverse case (no EXIF, filename has a +date) is unchanged. + +The `backfill_missing_date_taken` drain (`src/main.rs`) runs every +watcher tick alongside `backfill_unhashed_backlog`. It loads up to +`DATE_BACKFILL_MAX_PER_TICK` rows (default 500) where +`date_taken IS NULL OR date_taken_source = 'fs_time'` (backed by the +`idx_image_exif_date_backfill` partial index), runs the waterfall +batch via `resolve_dates_batch`, and writes results via the +`backfill_date_taken` DAO method (touches only `date_taken` + +`date_taken_source` so EXIF / hash / perceptual columns are +preserved). `filename`-sourced rows are intentionally not re-resolved +— the regex is authoritative when it matches, and re-running exiftool +won't change the answer. + +`/memories` is a single SQL query against this column +(`get_memories_in_window` in `src/database/mod.rs`), using +`strftime('%m-%d' | '%W' | '%m', date_taken, 'unixepoch', tz)` for +calendar matching with the client's timezone offset. The pre-rewrite +version stat'd every row and walked the entire library tree — at +~14k photos this took 10–15 s; the rewrite is single-digit ms. + **EXIF Extraction:** - Uses `kamadak-exif` crate - Supports: JPEG, TIFF, RAW (NEF, CR2, CR3), HEIF/HEIC, PNG, WebP @@ -534,6 +581,7 @@ Optional: ```bash WATCH_QUICK_INTERVAL_SECONDS=60 # Quick scan interval WATCH_FULL_INTERVAL_SECONDS=3600 # Full scan interval +DATE_BACKFILL_MAX_PER_TICK=500 # Cap on canonical-date drain per watcher tick OTLP_OTLS_ENDPOINT=http://... 
# OpenTelemetry collector (release builds) # AI Insights Configuration diff --git a/src/database/mod.rs b/src/database/mod.rs index 0754bea..32127cb 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -9,6 +9,21 @@ use crate::database::models::{ }; use crate::otel::trace_db_call; +/// Decoded shape for `get_memories_in_window`'s raw `sql_query`. Diesel's +/// query DSL doesn't expose strftime, so the memories filter is hand- +/// written SQL — but the returned columns are simple enough that a small +/// `QueryableByName` struct suffices, kept private to this module. +#[derive(diesel::QueryableByName)] +#[allow(dead_code)] // fields read via Diesel's QueryableByName derive +struct MemoriesWindowRow { + #[diesel(sql_type = diesel::sql_types::Text)] + rel_path: String, + #[diesel(sql_type = diesel::sql_types::BigInt)] + date_taken: i64, + #[diesel(sql_type = diesel::sql_types::BigInt)] + last_modified: i64, +} + /// Wire shape for a single member of a duplicate group, returned by /// `list_duplicates_*` and `lookup_duplicate_row`. Carries everything /// the Apollo modal needs to render a member tile and its meta line — @@ -424,6 +439,35 @@ pub trait ExifDao: Sync + Send { source: &str, ) -> Result<(), DbError>; + /// Single-query backend for `/memories`. Returns + /// `(rel_path, date_taken, last_modified)` for rows in `library_id` + /// whose `date_taken` falls within `[now - years_back y, now]` and + /// whose calendar position matches the request's span: + /// - `"day"` — same month + day-of-month (any year) + /// - `"week"` — same week-of-year (SQLite `%W`, Monday-anchored — + /// close to but not exactly ISO week 8601; the + /// boundary cases at year-start/end can shift by ±1 + /// vs the prior request-time `iso_week()` filter) + /// - `"month"` — same month (any year) + /// + /// `tz_offset_minutes` is applied to both sides of the strftime + /// comparison so the calendar match is in the user's local time. 
+ /// Backed by the `(library_id, date_taken)` index. + /// + /// This is the single-SQL replacement for the EXIF-loop + + /// WalkDir-fallback that powered `/memories` previously; it's + /// correct only because the canonical-date waterfall at ingest + /// (`crate::date_resolver`) populates `date_taken` for every row + /// it can resolve. + fn get_memories_in_window( + &mut self, + context: &opentelemetry::Context, + library_id: i32, + span_token: &str, + years_back: i32, + tz_offset_minutes: i32, + ) -> Result, DbError>; + /// Return image rows that have a `content_hash` but no `phash_64`, /// oldest first. Used by the `backfill_perceptual_hash` binary. /// Filters by image extension at the DB layer to avoid ever asking @@ -1090,23 +1134,28 @@ impl ExifDao for SqliteExifDao { library_id_val: i32, limit: i64, ) -> Result, DbError> { - trace_db_call(context, "query", "get_rows_needing_date_backfill", |_span| { - use schema::image_exif::dsl::*; + trace_db_call( + context, + "query", + "get_rows_needing_date_backfill", + |_span| { + use schema::image_exif::dsl::*; - let mut connection = self.connection.lock().expect("Unable to get ExifDao"); + let mut connection = self.connection.lock().expect("Unable to get ExifDao"); - // The partial index is on `(library_id, id) WHERE date_taken - // IS NULL OR date_taken_source = 'fs_time'`, so the planner - // hits it directly when both predicates are present. - image_exif - .filter(library_id.eq(library_id_val)) - .filter(date_taken.is_null().or(date_taken_source.eq("fs_time"))) - .select((library_id, rel_path)) - .order(id.asc()) - .limit(limit) - .load::<(i32, String)>(connection.deref_mut()) - .map_err(|_| anyhow::anyhow!("Query error")) - }) + // The partial index is on `(library_id, id) WHERE date_taken + // IS NULL OR date_taken_source = 'fs_time'`, so the planner + // hits it directly when both predicates are present. 
+ image_exif + .filter(library_id.eq(library_id_val)) + .filter(date_taken.is_null().or(date_taken_source.eq("fs_time"))) + .select((library_id, rel_path)) + .order(id.asc()) + .limit(limit) + .load::<(i32, String)>(connection.deref_mut()) + .map_err(|_| anyhow::anyhow!("Query error")) + }, + ) .map_err(|_| DbError::new(DbErrorKind::QueryError)) } @@ -1128,10 +1177,7 @@ impl ExifDao for SqliteExifDao { .filter(library_id.eq(library_id_val)) .filter(rel_path.eq(rel_path_val)), ) - .set(( - date_taken.eq(date_taken_val), - date_taken_source.eq(source), - )) + .set((date_taken.eq(date_taken_val), date_taken_source.eq(source))) .execute(connection.deref_mut()) .map(|_| ()) .map_err(|_| anyhow::anyhow!("Update error")) @@ -1139,6 +1185,60 @@ impl ExifDao for SqliteExifDao { .map_err(|_| DbError::new(DbErrorKind::UpdateError)) } + fn get_memories_in_window( + &mut self, + context: &opentelemetry::Context, + library_id: i32, + span_token: &str, + years_back: i32, + tz_offset_minutes: i32, + ) -> Result, DbError> { + trace_db_call(context, "query", "get_memories_in_window", |_span| { + // strftime pattern is span-dependent; the rest of the WHERE + // clause is shared. Only `%m-%d`, `%W`, `%m` are accepted — + // anything else is a programmer error. + let pattern = match span_token { + "day" => "%m-%d", + "week" => "%W", + "month" => "%m", + _ => return Err(anyhow::anyhow!("invalid span token: {}", span_token)), + }; + + // SQLite's date modifiers want a string like `'-480 minutes'` + // (signed) or `'-15 years'`. Use the `+` flag so positive + // offsets render as `+480 minutes`. 
+ let tz_modifier = format!("{:+} minutes", tz_offset_minutes); + let years_modifier = format!("-{} years", years_back); + + let sql = format!( + "SELECT rel_path, date_taken, last_modified \ + FROM image_exif \ + WHERE library_id = ?1 \ + AND date_taken IS NOT NULL \ + AND date_taken <= unixepoch('now') \ + AND date_taken >= unixepoch('now', ?2) \ + AND strftime('{p}', date_taken, 'unixepoch', ?3) \ + = strftime('{p}', 'now', ?3)", + p = pattern, + ); + + let mut connection = self.connection.lock().expect("Unable to get ExifDao"); + + diesel::sql_query(sql) + .bind::(library_id) + .bind::(years_modifier) + .bind::(tz_modifier) + .load::(connection.deref_mut()) + .map(|rows| { + rows.into_iter() + .map(|r| (r.rel_path, r.date_taken, r.last_modified)) + .collect() + }) + .map_err(|e| anyhow::anyhow!("Query error: {}", e)) + }) + .map_err(|_| DbError::new(DbErrorKind::QueryError)) + } + fn find_by_content_hash( &mut self, context: &opentelemetry::Context, @@ -2069,9 +2169,7 @@ mod exif_dao_tests { // Other library — never returned even when eligible. insert_row_with_source(&mut dao, 2, "archive/null.jpg", None, None); - let rows = dao - .get_rows_needing_date_backfill(&ctx(), 1, 100) - .unwrap(); + let rows = dao.get_rows_needing_date_backfill(&ctx(), 1, 100).unwrap(); let paths: Vec = rows.into_iter().map(|(_, p)| p).collect(); assert_eq!(paths.len(), 2, "expected null + fs_time eligible only"); assert!(paths.contains(&"main/null.jpg".to_string())); @@ -2098,4 +2196,125 @@ mod exif_dao_tests { assert_eq!(row.content_hash, Some("deadbeef".to_string())); assert_eq!(row.size_bytes, Some(1024)); } + + #[test] + fn get_memories_in_window_day_matches_only_same_md_in_year_window() { + let mut dao = setup_two_libraries(); + + // Anchor on a known date so the test is timezone-stable: insert + // rows whose date_taken IS the same wall-clock time as `now()` + // would have been some N years ago, and verify the day-span + // filter returns them. 
We can't bind 'now' from Rust, so instead + // we insert rows for the *current* day (offset by 365 days * N + // years) and rely on SQLite computing the same `%m-%d` for both + // sides of the equality. Using the unix-now-minus-365*N seconds + // approximation is good enough — leap years drift by ~one day + // every four years, but the test only checks day-of-year match + // for rows inserted "today minus N years (no leap correction)". + // To dodge the leap-year drift entirely, we use rows whose + // calendar date is read back from SQLite and we just check + // membership. + + // 1y, 5y, 10y, 21y back from 'now': + let now_ts = chrono::Utc::now().timestamp(); + let year_secs: i64 = 365 * 86_400; + insert_row_with_source( + &mut dao, + 1, + "y1.jpg", + Some(now_ts - year_secs), + Some("exif"), + ); + insert_row_with_source( + &mut dao, + 1, + "y5.jpg", + Some(now_ts - 5 * year_secs), + Some("exif"), + ); + insert_row_with_source( + &mut dao, + 1, + "y10.jpg", + Some(now_ts - 10 * year_secs), + Some("exif"), + ); + // Outside the 20-year window: + insert_row_with_source( + &mut dao, + 1, + "y21.jpg", + Some(now_ts - 21 * year_secs), + Some("exif"), + ); + // Future row: must be excluded by the `<= now` clause. + insert_row_with_source( + &mut dao, + 1, + "future.jpg", + Some(now_ts + 86_400), + Some("exif"), + ); + // No date — never returned regardless of source. + insert_row_with_source(&mut dao, 1, "nodate.jpg", None, None); + + // Month span returns rows from the same calendar month over the + // window — y1, y5, y10 should all qualify (same month any year), + // y21 trims (out of years_back), future trims (> now), nodate + // never qualifies. Day-of-month leap drift means even with 365- + // day approximation a row may shift by one in either direction; + // month is the safer assertion under that approximation. 
+ let rows = dao + .get_memories_in_window(&ctx(), 1, "month", 20, 0) + .unwrap(); + let paths: std::collections::HashSet = + rows.into_iter().map(|(p, _, _)| p).collect(); + assert!( + paths.contains("y1.jpg") && paths.contains("y5.jpg") && paths.contains("y10.jpg"), + "month span should include all in-window rows: {:?}", + paths + ); + assert!( + !paths.contains("y21.jpg"), + "21-year-old row should fall outside the years_back window" + ); + assert!(!paths.contains("future.jpg"), "future row must be excluded"); + assert!( + !paths.contains("nodate.jpg"), + "row without date must never appear" + ); + } + + #[test] + fn get_memories_in_window_scopes_by_library_id() { + let mut dao = setup_two_libraries(); + let now_ts = chrono::Utc::now().timestamp(); + let year = 365 * 86_400i64; + insert_row_with_source(&mut dao, 1, "main/x.jpg", Some(now_ts - year), Some("exif")); + insert_row_with_source( + &mut dao, + 2, + "archive/x.jpg", + Some(now_ts - year), + Some("exif"), + ); + + let lib1 = dao + .get_memories_in_window(&ctx(), 1, "month", 20, 0) + .unwrap(); + let lib2 = dao + .get_memories_in_window(&ctx(), 2, "month", 20, 0) + .unwrap(); + assert_eq!(lib1.len(), 1); + assert_eq!(lib1[0].0, "main/x.jpg"); + assert_eq!(lib2.len(), 1); + assert_eq!(lib2[0].0, "archive/x.jpg"); + } + + #[test] + fn get_memories_in_window_rejects_unknown_span_token() { + let mut dao = setup_two_libraries(); + let err = dao.get_memories_in_window(&ctx(), 1, "decade", 20, 0); + assert!(err.is_err()); + } } diff --git a/src/files.rs b/src/files.rs index b798330..e4fb407 100644 --- a/src/files.rs +++ b/src/files.rs @@ -1666,6 +1666,17 @@ mod tests { Ok(()) } + fn get_memories_in_window( + &mut self, + _context: &opentelemetry::Context, + _library_id: i32, + _span_token: &str, + _years_back: i32, + _tz_offset_minutes: i32, + ) -> Result, DbError> { + Ok(Vec::new()) + } + fn find_by_content_hash( &mut self, _context: &opentelemetry::Context, diff --git a/src/memories.rs b/src/memories.rs index 
54ae188..524e2a0 100644 --- a/src/memories.rs +++ b/src/memories.rs @@ -1,25 +1,19 @@ use actix_web::web::Data; use actix_web::{HttpRequest, HttpResponse, Responder, get, web}; use chrono::LocalResult::{Ambiguous, Single}; -use chrono::{DateTime, Datelike, FixedOffset, Local, LocalResult, NaiveDate, TimeZone, Utc}; +use chrono::{DateTime, FixedOffset, Local, LocalResult, NaiveDate, TimeZone}; use log::{debug, trace, warn}; use opentelemetry::KeyValue; use opentelemetry::trace::{Span, Status, TraceContextExt, Tracer}; -use rayon::prelude::*; use serde::{Deserialize, Serialize}; -use std::collections::HashSet; use std::path::Path; use std::path::PathBuf; use std::sync::Mutex; -use walkdir::WalkDir; use crate::data::Claims; use crate::database::ExifDao; -use crate::files::is_image_or_video; -use crate::libraries::Library; use crate::otel::{extract_context_from_request, global_tracer}; use crate::state::AppState; -use crate::utils::earliest_fs_time; // Helper that encapsulates path-exclusion semantics #[derive(Debug)] @@ -139,22 +133,6 @@ pub struct MemoriesResponse { pub items: Vec, } -/// Convert Unix timestamp to NaiveDate in client timezone -fn timestamp_to_naive_date( - timestamp: i64, - client_timezone: &Option, -) -> Option { - let dt_utc = DateTime::::from_timestamp(timestamp, 0)?; - - let date = if let Some(tz) = client_timezone { - dt_utc.with_timezone(tz).date_naive() - } else { - dt_utc.with_timezone(&Local).date_naive() - }; - - Some(date) -} - pub fn extract_date_from_filename(filename: &str) -> Option> { let build_date_from_ymd_capture = |captures: ®ex::Captures| -> Option> { @@ -283,232 +261,21 @@ pub fn extract_date_from_filename(filename: &str) -> Option, - client_timezone: &Option, -) -> Option<(NaiveDate, Option, Option)> { - // Read file metadata once - let meta = std::fs::metadata(path).ok()?; - - // Priority 1: Try to extract date from filename - if let Some(filename_date) = path - .file_name() - .and_then(|f| f.to_str()) - 
.and_then(extract_date_from_filename) - { - // Convert to client timezone if specified - let date_in_timezone = if let Some(tz) = client_timezone { - filename_date.with_timezone(tz) - } else { - filename_date.with_timezone(&Local).fixed_offset() - }; - - let timestamp = if let Some(tz) = client_timezone { - filename_date.with_timezone(tz).timestamp() - } else { - filename_date.timestamp() - }; - - let modified = meta.modified().ok().map(|t| { - let utc: DateTime = t.into(); - if let Some(tz) = client_timezone { - utc.with_timezone(tz).timestamp() - } else { - utc.timestamp() - } - }); - - debug!( - "Memory date from filename {:?} > {:?} = {:?}", - path.file_name(), - filename_date, - date_in_timezone - ); - return Some((date_in_timezone.date_naive(), Some(timestamp), modified)); - } - - // Priority 2: Use EXIF date_taken if available - if let Some(exif_timestamp) = exif_date_taken { - let date = timestamp_to_naive_date(exif_timestamp, client_timezone)?; - - let modified = meta.modified().ok().map(|t| { - let utc: DateTime = t.into(); - if let Some(tz) = client_timezone { - utc.with_timezone(tz).timestamp() - } else { - utc.timestamp() - } - }); - - debug!("Memory date from EXIF {:?} = {:?}", path.file_name(), date); - return Some((date, Some(exif_timestamp), modified)); - } - - // Priority 3: Fall back to metadata (earlier of created/modified — see utils::earliest_fs_time) - let system_time = earliest_fs_time(&meta)?; - let dt_utc: DateTime = system_time.into(); - - let date_in_timezone = if let Some(tz) = client_timezone { - dt_utc.with_timezone(tz).date_naive() - } else { - dt_utc.with_timezone(&Local).date_naive() - }; - - let created_timestamp = if let Some(tz) = client_timezone { - dt_utc.with_timezone(tz).timestamp() - } else { - dt_utc.timestamp() - }; - - let modified = meta.modified().ok().map(|t| { - let utc: DateTime = t.into(); - if let Some(tz) = client_timezone { - utc.with_timezone(tz).timestamp() - } else { - utc.timestamp() - } - }); - - 
trace!("Fallback metadata create date = {:?}", date_in_timezone); - Some((date_in_timezone, Some(created_timestamp), modified)) +/// Convert a `date_taken` Unix-seconds value to a `NaiveDate` in the +/// client's local time. Falls back to server-local when the client didn't +/// send a tz hint. +fn date_in_client_tz(timestamp: i64, client_timezone: Option) -> Option { + let dt = DateTime::from_timestamp(timestamp, 0)?; + Some(match client_timezone { + Some(tz) => dt.with_timezone(&tz).date_naive(), + None => dt.with_timezone(&Local).date_naive(), + }) } -/// Collect memories from EXIF database -fn collect_exif_memories( - exif_dao: &Data>>, - context: &opentelemetry::Context, - base_path: &str, - library_id: i32, - now: NaiveDate, - span_mode: MemoriesSpan, - years_back: u32, - client_timezone: &Option, - path_excluder: &PathExcluder, -) -> Vec<(MemoryItem, NaiveDate)> { - // Query database for all files with date_taken - let exif_records = match exif_dao.lock() { - Ok(mut dao) => match dao.get_all_with_date_taken(context, Some(library_id)) { - Ok(records) => records, - Err(e) => { - warn!("Failed to query EXIF database: {:?}", e); - return Vec::new(); // Graceful fallback - } - }, - Err(e) => { - warn!("Failed to lock EXIF DAO: {:?}", e); - return Vec::new(); - } - }; - - // Parallel processing with Rayon - exif_records - .par_iter() - .filter_map(|(file_path, date_taken_ts)| { - // Build full path - let full_path = Path::new(base_path).join(file_path); - - // Check exclusions - if path_excluder.is_excluded(&full_path) { - return None; - } - - // Verify file exists - if !full_path.exists() || !full_path.is_file() { - warn!("EXIF record exists but file not found: {:?}", full_path); - return None; - } - - // Get date with priority: filename → EXIF → metadata - // This ensures sorting and display use the same date source - let (file_date, created, modified) = - get_memory_date_with_priority(&full_path, Some(*date_taken_ts), client_timezone)?; - - // Check if matches 
memory criteria - if !is_memories_match(file_path, file_date, now, span_mode, years_back) { - return None; - } - - Some(( - MemoryItem { - path: file_path.clone(), - created, - modified, - library_id, - }, - file_date, - )) - }) - .collect() -} - -/// Collect memories from file system scan (for files not in EXIF DB) -fn collect_filesystem_memories( - base_path: &str, - library_id: i32, - path_excluder: &PathExcluder, - skip_paths: &HashSet, - now: NaiveDate, - span_mode: MemoriesSpan, - years_back: u32, - client_timezone: &Option, -) -> Vec<(MemoryItem, NaiveDate)> { - let base = Path::new(base_path); - - let entries: Vec<_> = WalkDir::new(base) - .into_iter() - .filter_map(|e| e.ok()) - .filter(|e| { - let path = e.path(); - - // Skip if already processed by EXIF query - if skip_paths.contains(path) { - return false; - } - - // Check exclusions - if path_excluder.is_excluded(path) { - return false; - } - - // Only process image/video files - e.file_type().is_file() && is_image_or_video(path) - }) - .collect(); - - entries - .par_iter() - .filter_map(|entry| { - // Use unified date priority function (no EXIF for filesystem scan) - let (file_date, created, modified) = - get_memory_date_with_priority(entry.path(), None, client_timezone)?; - - if is_memories_match( - entry.path().to_str().unwrap_or("Unknown"), - file_date, - now, - span_mode, - years_back, - ) { - let path_relative = entry.path().strip_prefix(base).ok()?.to_str()?.to_string(); - - Some(( - MemoryItem { - path: path_relative, - created, - modified, - library_id, - }, - file_date, - )) - } else { - None - } - }) - .collect() -} +/// Default lookback for `/memories`. The original 15-year cap pre-dated +/// most of the imported libraries; bumped to 20 so users with deeper +/// archives see those photos surface on the matching anniversary too. 
+pub const DEFAULT_YEARS_BACK: i32 = 20; #[get("/memories")] pub async fn list_memories( @@ -525,32 +292,28 @@ pub async fn list_memories( opentelemetry::Context::new().with_remote_span_context(span.span_context().clone()); let span_mode = q.span.unwrap_or(MemoriesSpan::Day); - let years_back: u32 = 15; - - // Create timezone from client offset, default to local timezone if not provided - let client_timezone = match q.timezone_offset_minutes { - Some(offset_mins) => { - let offset_secs = offset_mins * 60; - Some( - FixedOffset::east_opt(offset_secs) - .unwrap_or_else(|| FixedOffset::east_opt(0).unwrap()), - ) - } - None => None, + let span_token = match span_mode { + MemoriesSpan::Day => "day", + MemoriesSpan::Week => "week", + MemoriesSpan::Month => "month", }; + let years_back: i32 = DEFAULT_YEARS_BACK; - let now = if let Some(tz) = client_timezone { - debug!("Client timezone: {:?}", tz); - Utc::now().with_timezone(&tz).date_naive() - } else { - Local::now().date_naive() - }; + // The SQL filter expects a signed offset in minutes from UTC; default + // 0 (UTC) when the client didn't send a hint. We also keep a chrono + // `FixedOffset` for sorting/secondary-key date math in Rust below — + // anchoring both sides on the same value keeps "what SQL matched" and + // "what we sort by" consistent. + let tz_offset_minutes = q.timezone_offset_minutes.unwrap_or(0); + let client_timezone = q + .timezone_offset_minutes + .and_then(|offset_mins| FixedOffset::east_opt(offset_mins * 60)); - debug!("Now: {:?}", now); + debug!( + "list_memories: span={:?} tz_offset_min={} years_back={}", + span_mode, tz_offset_minutes, years_back + ); - // Resolve the optional library filter. Unknown values are a 400; None - // means "all libraries" — currently equivalent to the primary library - // while only one is configured. 
let library = match crate::libraries::resolve_library_param(&app_state, q.library.as_deref()) { Ok(lib) => lib, Err(msg) => { @@ -558,13 +321,13 @@ pub async fn list_memories( return HttpResponse::BadRequest().body(msg); } }; - // When `library` is `Some`, scope to that one library; otherwise union - // across every configured library and let the results interleave. - let libraries_to_scan: Vec<&Library> = match library { + let libraries_to_scan: Vec<&crate::libraries::Library> = match library { Some(lib) => vec![lib], None => app_state.libraries.iter().collect(), }; + // (item, date) tuples — `date` is the canonical NaiveDate of the + // memory in the client's tz, used as the primary sort key. let mut memories_with_dates: Vec<(MemoryItem, NaiveDate)> = Vec::new(); for lib in &libraries_to_scan { @@ -572,78 +335,82 @@ pub async fn list_memories( let effective = lib.effective_excluded_dirs(&app_state.excluded_dirs); let path_excluder = PathExcluder::new(base, &effective); - let exif_memories = collect_exif_memories( - &exif_dao, - &span_context, - &lib.root_path, - lib.id, - now, - span_mode, - years_back, - &client_timezone, - &path_excluder, - ); + let rows = match exif_dao.lock() { + Ok(mut dao) => match dao.get_memories_in_window( + &span_context, + lib.id, + span_token, + years_back, + tz_offset_minutes, + ) { + Ok(rows) => rows, + Err(e) => { + warn!( + "Failed to query memories for library '{}': {:?}", + lib.name, e + ); + continue; + } + }, + Err(e) => { + warn!("Failed to lock EXIF DAO: {:?}", e); + continue; + } + }; - let exif_paths: HashSet = exif_memories - .iter() - .map(|(item, _)| PathBuf::from(&lib.root_path).join(&item.path)) - .collect(); + for (rel_path, date_taken_ts, last_modified_ts) in rows { + // Apply per-library exclusions in Rust — they're a small + // set and pushing them into the SQL WHERE adds bind-param + // gymnastics with no measurable win at this scale. 
+ let full_path = base.join(&rel_path); + if path_excluder.is_excluded(&full_path) { + trace!("Memory excluded by PathExcluder: {:?}", full_path); + continue; + } - let fs_memories = collect_filesystem_memories( - &lib.root_path, - lib.id, - &path_excluder, - &exif_paths, - now, - span_mode, - years_back, - &client_timezone, - ); + let Some(file_date) = date_in_client_tz(date_taken_ts, client_timezone) else { + continue; + }; - memories_with_dates.extend(exif_memories); - memories_with_dates.extend(fs_memories); + memories_with_dates.push(( + MemoryItem { + path: rel_path, + created: Some(date_taken_ts), + modified: Some(last_modified_ts), + library_id: lib.id, + }, + file_date, + )); + } } + // Sort once over the merged result set. The SQL filter handles the + // matching; sort order is purely UI concern. match span_mode { - // Sort by absolute time for a more 'overview' + // Month: chronological — gives an "overview" feel. MemoriesSpan::Month => memories_with_dates.sort_by(|a, b| a.1.cmp(&b.1)), - // For week span, sort by full date + timestamp (chronological) + // Week: full date then timestamp (oldest → newest). 
MemoriesSpan::Week => { memories_with_dates.sort_by(|a, b| { - // First, sort by full date (year, month, day) - let date_cmp = a.1.cmp(&b.1); - if date_cmp != std::cmp::Ordering::Equal { - return date_cmp; - } - - // Then sort by full created timestamp (oldest to newest) - match (a.0.created, b.0.created) { - (Some(a_time), Some(b_time)) => a_time.cmp(&b_time), - (Some(_), None) => std::cmp::Ordering::Less, - (None, Some(_)) => std::cmp::Ordering::Greater, - (None, None) => std::cmp::Ordering::Equal, - } - }); - } - // For day span, sort by day of month then by time - MemoriesSpan::Day => { - memories_with_dates.sort_by(|a, b| { - let day_comparison = a.1.day().cmp(&b.1.day()); - - if day_comparison == std::cmp::Ordering::Equal { - match (a.0.created, b.0.created) { - (Some(a_time), Some(b_time)) => a_time.cmp(&b_time), + a.1.cmp(&b.1) + .then_with(|| match (a.0.created, b.0.created) { + (Some(at), Some(bt)) => at.cmp(&bt), (Some(_), None) => std::cmp::Ordering::Less, (None, Some(_)) => std::cmp::Ordering::Greater, (None, None) => std::cmp::Ordering::Equal, - } - } else { - day_comparison - } + }) + }); + } + // Day: same calendar day across years, sub-sorted by timestamp. 
+ MemoriesSpan::Day => { + memories_with_dates.sort_by(|a, b| match (a.0.created, b.0.created) { + (Some(at), Some(bt)) => at.cmp(&bt), + (Some(_), None) => std::cmp::Ordering::Less, + (None, Some(_)) => std::cmp::Ordering::Greater, + (None, None) => std::cmp::Ordering::Equal, }); } } - // Sort by day of the month and time (using the created timestamp) let items: Vec = memories_with_dates.into_iter().map(|(m, _)| m).collect(); @@ -653,13 +420,7 @@ pub async fn list_memories( KeyValue::new("span", format!("{:?}", span_mode)), KeyValue::new("years_back", years_back.to_string()), KeyValue::new("result_count", items.len().to_string()), - KeyValue::new( - "client_timezone", - format!( - "{:?}", - client_timezone.unwrap_or_else(|| FixedOffset::east_opt(0).unwrap()) - ), - ), + KeyValue::new("tz_offset_minutes", tz_offset_minutes.to_string()), KeyValue::new("excluded_dirs", format!("{:?}", app_state.excluded_dirs)), ], ); @@ -668,50 +429,10 @@ pub async fn list_memories( HttpResponse::Ok().json(MemoriesResponse { items }) } -fn is_memories_match( - file_path: &str, - file_date: NaiveDate, - today: NaiveDate, - span: MemoriesSpan, - years_back: u32, -) -> bool { - if file_date > today { - return false; - } - let years_diff = (today.year() - file_date.year()).unsigned_abs(); - if years_diff > years_back { - warn!( - "File ({}) date is too far in the past: {:?} vs {:?}", - file_path, file_date, today - ); - return false; - } - - match span { - MemoriesSpan::Day => same_month_day_any_year(file_date, today), - MemoriesSpan::Week => same_week_any_year(file_date, today), - MemoriesSpan::Month => same_month_any_year(file_date, today), - } -} - -fn same_month_day_any_year(a: NaiveDate, b: NaiveDate) -> bool { - a.month() == b.month() && a.day() == b.day() -} - -// Match same ISO week number and same weekday (ignoring year) -fn same_week_any_year(a: NaiveDate, b: NaiveDate) -> bool { - a.iso_week().week().eq(&b.iso_week().week()) -} - -// Match same month (ignoring day and year) 
-fn same_month_any_year(a: NaiveDate, b: NaiveDate) -> bool { - a.month() == b.month() -} - #[cfg(test)] mod tests { use super::*; - use chrono::Timelike; + use chrono::{Datelike, Timelike}; use std::fs::{self, File}; use tempfile::tempdir; @@ -869,99 +590,11 @@ mod tests { ); } - #[test] - fn test_memory_date_priority_filename() { - let temp_dir = tempdir().unwrap(); - let temp_file = temp_dir.path().join("Screenshot_2014-06-01-20-44-50.png"); - File::create(&temp_file).unwrap(); - - // Test that filename takes priority (even with EXIF data available) - let exif_date = DateTime::::from_timestamp(1609459200, 0) // 2021-01-01 - .unwrap() - .timestamp(); - - let (date, created, _) = get_memory_date_with_priority( - &temp_file, - Some(exif_date), - &Some(*Local::now().fixed_offset().offset()), - ) - .unwrap(); - - // Check that date is from filename (2014), NOT EXIF (2021) - assert_eq!(date.year(), 2014); - assert_eq!(date.month(), 6); - assert_eq!(date.day(), 1); - - // Check that created timestamp matches the date from filename - assert!(created.is_some()); - let ts = created.unwrap(); - // The timestamp should be for 2014-06-01 20:44:50 in the LOCAL timezone - let dt_from_ts = Local.timestamp_opt(ts, 0).unwrap(); - assert_eq!(dt_from_ts.year(), 2014); - assert_eq!(dt_from_ts.month(), 6); - assert_eq!(dt_from_ts.day(), 1); - assert_eq!(dt_from_ts.hour(), 20); - assert_eq!(dt_from_ts.minute(), 44); - assert_eq!(dt_from_ts.second(), 50); - } - - #[test] - fn test_memory_date_priority_metadata_fallback() { - let temp_dir = tempdir().unwrap(); - let temp_file = temp_dir.path().join("regular_image.jpg"); - File::create(&temp_file).unwrap(); - - // Test metadata fallback when no filename date or EXIF - let (date, created, modified) = - get_memory_date_with_priority(&temp_file, None, &None).unwrap(); - - // Both date and timestamps should be from metadata (recent) - let today = Local::now().date_naive(); - assert_eq!(date.year(), today.year()); - assert_eq!(date.month(), 
today.month()); - - // Both timestamps should be valid - assert!(created.is_some()); - assert!(modified.is_some()); - - // Check that timestamps are recent - let dt_created = DateTime::::from_timestamp(created.unwrap(), 0).unwrap(); - assert_eq!(dt_created.year(), today.year()); - - let dt_modified = DateTime::::from_timestamp(modified.unwrap(), 0).unwrap(); - assert_eq!(dt_modified.year(), today.year()); - } - - #[test] - fn test_memory_date_priority_exif_over_metadata() { - let temp_dir = tempdir().unwrap(); - let temp_file = temp_dir.path().join("regular_image.jpg"); - File::create(&temp_file).unwrap(); - - // Test that EXIF takes priority over metadata (but not filename) - // EXIF date: June 15, 2020 12:00:00 UTC (safe from timezone edge cases) - let exif_date = DateTime::::from_timestamp(1592222400, 0) // 2020-06-15 12:00:00 UTC - .unwrap() - .timestamp(); - - let (date, created, modified) = - get_memory_date_with_priority(&temp_file, Some(exif_date), &None).unwrap(); - - // Date should be from EXIF (2020), not metadata (today) - assert_eq!(date.year(), 2020); - assert_eq!(date.month(), 6); - assert_eq!(date.day(), 15); - - // Created timestamp should also be from EXIF - assert!(created.is_some()); - assert_eq!(created.unwrap(), exif_date); - - // Modified should still be from metadata - assert!(modified.is_some()); - let today = Local::now().date_naive(); - let dt_modified = DateTime::::from_timestamp(modified.unwrap(), 0).unwrap(); - assert_eq!(dt_modified.year(), today.year()); - } + // The obsolete `test_memory_date_priority_*` tests covered the old + // request-time waterfall in `get_memory_date_with_priority`. Their + // replacement lives in `crate::date_resolver::tests` (resolver + // waterfall) and the SQL surface is exercised by integration tests + // that hit `get_memories_in_window` directly. 
#[test] fn test_path_excluder_absolute_under_base() { From 9f1b3f6d9a5c2e823c743a84752c4eb9871d01df Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 6 May 2026 17:05:00 -0400 Subject: [PATCH 6/6] date_taken_source: backfill 'exif' on legacy rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-resolver rows already had a populated `date_taken` from the old kamadak-exif-only ingest path. The column-add migration left their `date_taken_source` as NULL, and the drain's eligibility predicate (`date_taken IS NULL OR date_taken_source = 'fs_time'`) skips them — so they remain unlabelled forever and never benefit from the resolver's exiftool fallback even if they're videos that should upgrade. Label them all `'exif'` in a one-shot UPDATE. Safe because every write path that populated `date_taken` before the resolver landed was a kamadak-exif read. Idempotent (the WHERE matches nothing on a second run). Down.sql is a no-op — the labels stay correct under any schema state, and the column-add migration is the right place to revert if needed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../down.sql | 9 +++++++++ .../up.sql | 20 +++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 migrations/2026-05-06-000100_backfill_date_taken_source_legacy/down.sql create mode 100644 migrations/2026-05-06-000100_backfill_date_taken_source_legacy/up.sql diff --git a/migrations/2026-05-06-000100_backfill_date_taken_source_legacy/down.sql b/migrations/2026-05-06-000100_backfill_date_taken_source_legacy/down.sql new file mode 100644 index 0000000..2d2c82a --- /dev/null +++ b/migrations/2026-05-06-000100_backfill_date_taken_source_legacy/down.sql @@ -0,0 +1,9 @@ +-- Reverting this migration is a no-op: the labels we wrote in `up.sql` +-- are correct under any state of the schema (every dated row was indeed +-- exif-sourced before the resolver landed), and there's no signal that +-- distinguishes "labelled by this migration" from "labelled by the +-- ingest path post-resolver". Clearing them would only discard that provenance. +-- NOTE(review): the predicate quoted in `up.sql` skips NULL- and `'exif'`-labelled dated rows alike, so clearing should not change drain eligibility — confirm. +-- +-- The companion migration `2026-05-06-000000_add_date_taken_source` is +-- the one to revert if you need to remove the column entirely. diff --git a/migrations/2026-05-06-000100_backfill_date_taken_source_legacy/up.sql b/migrations/2026-05-06-000100_backfill_date_taken_source_legacy/up.sql new file mode 100644 index 0000000..cccf343 --- /dev/null +++ b/migrations/2026-05-06-000100_backfill_date_taken_source_legacy/up.sql @@ -0,0 +1,20 @@ +-- Backfill `date_taken_source` for rows that pre-date the canonical-date +-- pipeline. Before the resolver landed, `image_exif.date_taken` could +-- only be populated via `exif::extract_exif_from_path` (kamadak-exif) +-- on the file-watcher, upload, or GPS-write paths.
The resolver column +-- migration added `date_taken_source` defaulting to NULL, so every +-- historical row with a date is currently unlabelled — and the +-- per-tick drain skips them because its eligibility predicate is +-- `date_taken IS NULL OR date_taken_source = 'fs_time'`. +-- +-- Label them `'exif'` once so every dated row carries an explicit source. +-- NOTE(review): `'exif'` rows still fail the predicate above, so the drain keeps skipping them after this backfill — confirm that is intended. +-- Safe because every code path that wrote `date_taken` prior to the resolver was a kamadak-exif read — there was no other source. +-- +-- Idempotent: re-running this migration on a DB that has already been +-- backfilled is a no-op (the WHERE clause matches nothing the second +-- time around). +UPDATE image_exif +SET date_taken_source = 'exif' +WHERE date_taken IS NOT NULL + AND date_taken_source IS NULL;