diff --git a/src/date_resolver.rs b/src/date_resolver.rs new file mode 100644 index 0000000..8498cb8 --- /dev/null +++ b/src/date_resolver.rs @@ -0,0 +1,495 @@ +//! Canonical `date_taken` resolution for ingest and the per-tick backfill +//! drain. +//! +//! The waterfall (in order; first hit wins): +//! +//! 1. **kamadak-exif** — fast in-process EXIF read. Already done by +//! `exif::extract_exif_from_path` for image-bearing formats; callers +//! pass that result in via `prior_exif_date` so we don't re-parse. +//! 2. **exiftool** — shell-out fallback that reaches places kamadak-exif +//! can't: QuickTime/MP4 (`MediaCreateDate`, `TrackCreateDate`, +//! `CreateDate`), Apple's `ContentCreateDate`, MakerNote sub-IFDs. +//! Required for videos to land a real date; degrades silently when +//! `exiftool` isn't on PATH. +//! 3. **filename regex** — `memories::extract_date_from_filename` covers +//! common screenshot / chat-export / timestamp-named patterns. +//! 4. **earliest filesystem time** — `utils::earliest_fs_time` picks the +//! earlier of created / modified, which on copied-from-backup files is +//! a better proxy for content age than either alone. +//! +//! `DateSource` records which step won so the per-tick drain can re-resolve +//! weak sources (`fs_time`) once exiftool becomes available, and so the +//! UI/debug surface can answer "why does this photo show up under this +//! date." Note that the previous `/memories` request-time logic preferred +//! filename even when EXIF was present; this resolver inverts that — EXIF +//! is authoritative when it exists, on the theory that an EXIF +//! `DateTimeOriginal` is more reliable than a filename pattern that may +//! reflect import time rather than capture time. + +use std::collections::HashMap; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use std::sync::OnceLock; + +use chrono::{DateTime, Utc}; +use log::{debug, trace, warn}; +use serde::Deserialize; + +use crate::utils::earliest_fs_time; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum DateSource { + /// kamadak-exif read DateTime/DateTimeOriginal directly. + Exif, + /// exiftool fallback caught a video / MakerNote / QuickTime tag. + Exiftool, + /// `extract_date_from_filename` matched a known pattern. + Filename, + /// Fell through to `earliest_fs_time(metadata)`. + FsTime, +} + +impl DateSource { + pub fn as_str(self) -> &'static str { + match self { + DateSource::Exif => "exif", + DateSource::Exiftool => "exiftool", + DateSource::Filename => "filename", + DateSource::FsTime => "fs_time", + } + } +} + +#[derive(Copy, Clone, Debug)] +pub struct ResolvedDate { + pub timestamp: i64, + pub source: DateSource, +} + +/// Resolve the canonical date for a single file, given an already-extracted +/// kamadak-exif date if available. Returns `None` only if every step in the +/// waterfall fails — for files that exist on disk this should be vanishingly +/// rare (the fs-time fallback alone almost always succeeds). +pub fn resolve_date_taken(path: &Path, prior_exif_date: Option) -> Option { + if let Some(ts) = prior_exif_date { + return Some(ResolvedDate { + timestamp: ts, + source: DateSource::Exif, + }); + } + if let Some(ts) = exiftool_date_single(path) { + return Some(ResolvedDate { + timestamp: ts, + source: DateSource::Exiftool, + }); + } + if let Some(dt) = path + .file_name() + .and_then(|f| f.to_str()) + .and_then(crate::memories::extract_date_from_filename) + { + return Some(ResolvedDate { + timestamp: dt.timestamp(), + source: DateSource::Filename, + }); + } + if let Ok(meta) = std::fs::metadata(path) + && let Some(t) = earliest_fs_time(&meta) + { + let dt: DateTime = t.into(); + return Some(ResolvedDate { + timestamp: dt.timestamp(), + source: DateSource::FsTime, + }); + } + None +} + +/// Batch waterfall. exiftool runs once over the whole batch (single +/// subprocess); everything else is per-file and runs only on misses. +/// `prior_exif_dates` lets the caller pass in already-known kamadak dates +/// keyed by path; entries without a prior date fall through to exiftool +/// and the rest of the waterfall. +/// +/// The per-tick backfill drain is the primary caller — it loads ~500 rows +/// at a time and uses one exiftool subprocess to drain the lot. +pub fn resolve_dates_batch( + paths: &[PathBuf], + prior_exif_dates: &HashMap, +) -> HashMap { + let mut out: HashMap = HashMap::new(); + let mut needs_exiftool: Vec<&Path> = Vec::with_capacity(paths.len()); + + for path in paths { + if let Some(&ts) = prior_exif_dates.get(path) { + out.insert( + path.clone(), + ResolvedDate { + timestamp: ts, + source: DateSource::Exif, + }, + ); + } else { + needs_exiftool.push(path.as_path()); + } + } + + if !needs_exiftool.is_empty() { + let exiftool_results = exiftool_dates_batch(&needs_exiftool); + for path in &needs_exiftool { + if let Some(&ts) = exiftool_results.get(*path) { + out.insert( + path.to_path_buf(), + ResolvedDate { + timestamp: ts, + source: DateSource::Exiftool, + }, + ); + } + } + } + + for path in paths { + if out.contains_key(path) { + continue; + } + if let Some(dt) = path + .file_name() + .and_then(|f| f.to_str()) + .and_then(crate::memories::extract_date_from_filename) + { + out.insert( + path.clone(), + ResolvedDate { + timestamp: dt.timestamp(), + source: DateSource::Filename, + }, + ); + continue; + } + if let Ok(meta) = std::fs::metadata(path) + && let Some(t) = earliest_fs_time(&meta) + { + let dt: DateTime = t.into(); + out.insert( + path.clone(), + ResolvedDate { + timestamp: dt.timestamp(), + source: DateSource::FsTime, + }, + ); + } + } + + out +} + +/// Tag priority for exiftool extraction. First non-zero value wins. +/// +/// Photos: `DateTimeOriginal` (original capture) and `SubSecDateTimeOriginal` +/// are most authoritative. `CreateDate` is a common alias and a sane fallback. +/// +/// Videos: `MediaCreateDate` / `TrackCreateDate` are the QuickTime/MP4 +/// timestamps. `ContentCreateDate` is Apple's iOS-set tag; it often +/// reflects local capture time on iPhone exports better than the others. +/// +/// Notably absent: `FileModifyDate` / `FileAccessDate` — those are +/// filesystem-derived and the resolver covers them via the `fs_time` +/// fallback. Letting exiftool pull them here would mask "no real EXIF +/// date" with a `source = exiftool` row that's no better than fs_time. +const EXIFTOOL_DATE_TAGS: &[&str] = &[ + "DateTimeOriginal", + "SubSecDateTimeOriginal", + "CreateDate", + "MediaCreateDate", + "TrackCreateDate", + "ContentCreateDate", +]; + +/// Cache the "exiftool exists on PATH" check across the process lifetime so +/// the per-tick backfill doesn't fork a doomed subprocess every iteration on +/// deploys without exiftool installed. +fn exiftool_available() -> bool { + static AVAIL: OnceLock = OnceLock::new(); + *AVAIL.get_or_init(|| { + let ok = Command::new("exiftool") + .arg("-ver") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .map(|s| s.success()) + .unwrap_or(false); + if !ok { + warn!("exiftool not on PATH; date_taken waterfall skips that step"); + } + ok + }) +} + +/// One-file exiftool invocation. Used by the upload + GPS-write paths, +/// which deal with one file at a time. The batch path uses +/// `exiftool_dates_batch` so we don't pay subprocess startup per row. +fn exiftool_date_single(path: &Path) -> Option { + if !exiftool_available() { + return None; + } + let mut cmd = Command::new("exiftool"); + cmd.arg("-j").arg("-q").arg("-d").arg("%s").arg("-fast2"); + for tag in EXIFTOOL_DATE_TAGS { + cmd.arg(format!("-{}", tag)); + } + cmd.arg(path); + let output = cmd.output().ok()?; + if !output.status.success() { + trace!("exiftool exited non-zero for {:?}", path); + return None; + } + parse_exiftool_json(&output.stdout) + .into_iter() + .next() + .map(|(_, ts)| ts) +} + +/// Drain a batch via a single exiftool subprocess. Paths are fed on stdin +/// via `-@ -`, so the argv stays short regardless of batch size — safe for +/// libraries with very long path components. +fn exiftool_dates_batch(paths: &[&Path]) -> HashMap { + let mut out = HashMap::new(); + if paths.is_empty() || !exiftool_available() { + return out; + } + + let mut cmd = Command::new("exiftool"); + cmd.arg("-j").arg("-q").arg("-d").arg("%s").arg("-fast2"); + for tag in EXIFTOOL_DATE_TAGS { + cmd.arg(format!("-{}", tag)); + } + cmd.arg("-@").arg("-"); + cmd.stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::null()); + + let mut child = match cmd.spawn() { + Ok(c) => c, + Err(e) => { + warn!("exiftool batch spawn failed: {}", e); + return out; + } + }; + + if let Some(mut stdin) = child.stdin.take() { + for p in paths { + // exiftool's argfile reader treats each line as one path; OS + // path bytes don't always survive a String round-trip, but + // every path we get here originated from rel_path / root_path + // strings already, so to-string-lossy is a non-event. + if let Err(e) = writeln!(stdin, "{}", p.display()) { + warn!("exiftool batch stdin write failed: {}", e); + break; + } + } + } + + let output = match child.wait_with_output() { + Ok(o) => o, + Err(e) => { + warn!("exiftool batch wait failed: {}", e); + return out; + } + }; + if !output.status.success() { + debug!( + "exiftool batch exit status {:?}; partial output may still parse", + output.status.code() + ); + } + for (source, ts) in parse_exiftool_json(&output.stdout) { + out.insert(PathBuf::from(source), ts); + } + out +} + +/// One row per input file. exiftool emits any tag we asked for that was +/// present, plus the `SourceFile` it was reading. Tags are JSON values +/// because `-d %s` returns the timestamp as a *string* of digits, not a +/// number, when the date parses; absent tags are simply missing keys. +#[derive(Debug, Deserialize)] +struct ExiftoolEntry { + #[serde(rename = "SourceFile")] + source_file: String, + #[serde(rename = "DateTimeOriginal")] + date_time_original: Option, + #[serde(rename = "SubSecDateTimeOriginal")] + sub_sec_date_time_original: Option, + #[serde(rename = "CreateDate")] + create_date: Option, + #[serde(rename = "MediaCreateDate")] + media_create_date: Option, + #[serde(rename = "TrackCreateDate")] + track_create_date: Option, + #[serde(rename = "ContentCreateDate")] + content_create_date: Option, +} + +fn parse_exiftool_json(stdout: &[u8]) -> Vec<(String, i64)> { + let entries: Vec = match serde_json::from_slice(stdout) { + Ok(v) => v, + Err(e) => { + // Empty stdout on total failure isn't a parse error worth + // logging at warn — the caller already noted the non-zero + // exit status. + if !stdout.is_empty() { + warn!("exiftool JSON parse failed: {}", e); + } + return Vec::new(); + } + }; + + let mut out = Vec::with_capacity(entries.len()); + for entry in entries { + // Walk the priority list. exiftool sometimes returns the literal + // string "0000:00:00 00:00:00" for missing-but-allocated date + // slots; with `-d %s` that becomes the unix epoch (0). Reject + // anything <= 0 so we fall through to the next tag. + let tags = [ + entry.date_time_original.as_ref(), + entry.sub_sec_date_time_original.as_ref(), + entry.create_date.as_ref(), + entry.media_create_date.as_ref(), + entry.track_create_date.as_ref(), + entry.content_create_date.as_ref(), + ]; + let mut chosen: Option = None; + for tag in tags.iter().flatten() { + if let Some(ts) = coerce_to_unix_seconds(tag) + && ts > 0 + { + chosen = Some(ts); + break; + } + } + if let Some(ts) = chosen { + out.push((entry.source_file, ts)); + } + } + out +} + +/// `-d %s` should hand us a numeric string, but exiftool's JSON encoder +/// will emit a number when the tag was defined as numeric in its lib — +/// accept both shapes. +fn coerce_to_unix_seconds(v: &serde_json::Value) -> Option { + match v { + serde_json::Value::String(s) => s.trim().parse::().ok(), + serde_json::Value::Number(n) => n.as_i64(), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_exiftool_json_picks_first_priority_tag() { + let json = br#"[{ + "SourceFile": "/lib/IMG.jpg", + "DateTimeOriginal": "1500000000", + "CreateDate": "1400000000" + }]"#; + let parsed = parse_exiftool_json(json); + assert_eq!(parsed, vec![("/lib/IMG.jpg".to_string(), 1500000000)]); + } + + #[test] + fn parse_exiftool_json_falls_through_zeros() { + // exiftool emits "0000:00:00 00:00:00" → unix epoch 0 with -d %s. + // The resolver should skip those and pick the next tag. + let json = br#"[{ + "SourceFile": "/lib/clip.mov", + "DateTimeOriginal": "0", + "MediaCreateDate": "1500000000" + }]"#; + let parsed = parse_exiftool_json(json); + assert_eq!(parsed, vec![("/lib/clip.mov".to_string(), 1500000000)]); + } + + #[test] + fn parse_exiftool_json_accepts_numeric_values() { + let json = br#"[{ + "SourceFile": "/lib/a.jpg", + "CreateDate": 1234567890 + }]"#; + let parsed = parse_exiftool_json(json); + assert_eq!(parsed, vec![("/lib/a.jpg".to_string(), 1234567890)]); + } + + #[test] + fn parse_exiftool_json_emits_nothing_when_no_tag_present() { + let json = br#"[{"SourceFile": "/lib/no_dates.bin"}]"#; + let parsed = parse_exiftool_json(json); + assert!(parsed.is_empty()); + } + + #[test] + fn parse_exiftool_json_handles_multiple_entries() { + let json = br#"[ + {"SourceFile": "/lib/a.jpg", "DateTimeOriginal": "100"}, + {"SourceFile": "/lib/b.jpg", "CreateDate": "200"} + ]"#; + let parsed = parse_exiftool_json(json); + assert_eq!( + parsed, + vec![ + ("/lib/a.jpg".to_string(), 100), + ("/lib/b.jpg".to_string(), 200) + ] + ); + } + + #[test] + fn date_source_as_str_round_trip() { + for src in [ + DateSource::Exif, + DateSource::Exiftool, + DateSource::Filename, + DateSource::FsTime, + ] { + assert!(!src.as_str().is_empty()); + } + } + + #[test] + fn resolve_uses_prior_exif_when_present() { + // Path doesn't need to exist when prior_exif_date short-circuits. + let resolved = + resolve_date_taken(Path::new("/nonexistent/file.jpg"), Some(1700000000)).unwrap(); + assert_eq!(resolved.timestamp, 1700000000); + assert_eq!(resolved.source, DateSource::Exif); + } + + #[test] + fn resolve_filename_when_no_exif_and_file_missing() { + // No prior EXIF, no exiftool match (file missing), but the filename + // pattern still matches so the resolver lands on Filename. + let resolved = resolve_date_taken( + Path::new("/nonexistent/Screenshot_2014-06-01-20-44-50.png"), + None, + ) + .unwrap(); + assert_eq!(resolved.source, DateSource::Filename); + } + + #[test] + fn resolve_fs_time_when_only_metadata_available() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("plain.jpg"); + std::fs::File::create(&path).unwrap(); + let resolved = resolve_date_taken(&path, None).unwrap(); + // exiftool may or may not be installed in the test env; either + // way the file has no EXIF and no filename date, so we should + // fall to fs_time. + assert_eq!(resolved.source, DateSource::FsTime); + } +} diff --git a/src/lib.rs b/src/lib.rs index c110d8e..46deaac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,6 +10,7 @@ pub mod cleanup; pub mod content_hash; pub mod data; pub mod database; +pub mod date_resolver; pub mod duplicates; pub mod error; pub mod exif; diff --git a/src/main.rs b/src/main.rs index 3c0a9a6..84af187 100644 --- a/src/main.rs +++ b/src/main.rs @@ -64,6 +64,7 @@ mod auth; mod content_hash; mod data; mod database; +mod date_resolver; mod duplicates; mod error; mod exif;