//! Canonical `date_taken` resolution for ingest and the per-tick backfill //! drain. //! //! The waterfall (in order; first hit wins): //! //! 1. **kamadak-exif** — fast in-process EXIF read. Already done by //! `exif::extract_exif_from_path` for image-bearing formats; callers //! pass that result in via `prior_exif_date` so we don't re-parse. //! 2. **exiftool** — shell-out fallback that reaches places kamadak-exif //! can't: QuickTime/MP4 (`MediaCreateDate`, `TrackCreateDate`, //! `CreateDate`), Apple's `ContentCreateDate`, MakerNote sub-IFDs. //! Required for videos to land a real date; degrades silently when //! `exiftool` isn't on PATH. //! 3. **filename regex** — `memories::extract_date_from_filename` covers //! common screenshot / chat-export / timestamp-named patterns. //! 4. **earliest filesystem time** — `utils::earliest_fs_time` picks the //! earlier of created / modified, which on copied-from-backup files is //! a better proxy for content age than either alone. //! //! `DateSource` records which step won so the per-tick drain can re-resolve //! weak sources (`fs_time`) once exiftool becomes available, and so the //! UI/debug surface can answer "why does this photo show up under this //! date." Note that the previous `/memories` request-time logic preferred //! filename even when EXIF was present; this resolver inverts that — EXIF //! is authoritative when it exists, on the theory that an EXIF //! `DateTimeOriginal` is more reliable than a filename pattern that may //! reflect import time rather than capture time. use std::collections::HashMap; use std::io::Write; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use std::sync::OnceLock; use chrono::{DateTime, Utc}; use log::{debug, trace, warn}; use serde::Deserialize; use crate::utils::earliest_fs_time; #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DateSource { /// kamadak-exif read DateTime/DateTimeOriginal directly. 
Exif, /// exiftool fallback caught a video / MakerNote / QuickTime tag. Exiftool, /// `extract_date_from_filename` matched a known pattern. Filename, /// Fell through to `earliest_fs_time(metadata)`. FsTime, } impl DateSource { pub fn as_str(self) -> &'static str { match self { DateSource::Exif => "exif", DateSource::Exiftool => "exiftool", DateSource::Filename => "filename", DateSource::FsTime => "fs_time", } } } #[derive(Copy, Clone, Debug)] pub struct ResolvedDate { pub timestamp: i64, pub source: DateSource, } /// Resolve the canonical date for a single file, given an already-extracted /// kamadak-exif date if available. Returns `None` only if every step in the /// waterfall fails — for files that exist on disk this should be vanishingly /// rare (the fs-time fallback alone almost always succeeds). pub fn resolve_date_taken(path: &Path, prior_exif_date: Option) -> Option { if let Some(ts) = prior_exif_date { return Some(ResolvedDate { timestamp: ts, source: DateSource::Exif, }); } if let Some(ts) = exiftool_date_single(path) { return Some(ResolvedDate { timestamp: ts, source: DateSource::Exiftool, }); } if let Some(dt) = path .file_name() .and_then(|f| f.to_str()) .and_then(crate::memories::extract_date_from_filename) { return Some(ResolvedDate { timestamp: dt.timestamp(), source: DateSource::Filename, }); } if let Ok(meta) = std::fs::metadata(path) && let Some(t) = earliest_fs_time(&meta) { let dt: DateTime = t.into(); return Some(ResolvedDate { timestamp: dt.timestamp(), source: DateSource::FsTime, }); } None } /// Batch waterfall. exiftool runs once over the whole batch (single /// subprocess); everything else is per-file and runs only on misses. /// `prior_exif_dates` lets the caller pass in already-known kamadak dates /// keyed by path; entries without a prior date fall through to exiftool /// and the rest of the waterfall. 
///
/// The per-tick backfill drain is the primary caller — it loads ~500 rows
/// at a time and uses one exiftool subprocess to drain the lot.
///
/// Paths for which every step fails are simply absent from the returned map.
pub fn resolve_dates_batch(
    paths: &[PathBuf],
    prior_exif_dates: &HashMap<PathBuf, i64>,
) -> HashMap<PathBuf, ResolvedDate> {
    let mut out: HashMap<PathBuf, ResolvedDate> = HashMap::with_capacity(paths.len());
    let mut needs_exiftool: Vec<&Path> = Vec::with_capacity(paths.len());

    // Pass 1: take caller-supplied EXIF dates verbatim; queue everything else
    // for the shared exiftool run.
    for path in paths {
        if let Some(&ts) = prior_exif_dates.get(path) {
            out.insert(
                path.clone(),
                ResolvedDate {
                    timestamp: ts,
                    source: DateSource::Exif,
                },
            );
        } else {
            needs_exiftool.push(path.as_path());
        }
    }

    // Pass 2: one exiftool subprocess for every queued path. Results are
    // looked up by the same path that was queued.
    if !needs_exiftool.is_empty() {
        let exiftool_results = exiftool_dates_batch(&needs_exiftool);
        for path in &needs_exiftool {
            if let Some(&ts) = exiftool_results.get(*path) {
                out.insert(
                    path.to_path_buf(),
                    ResolvedDate {
                        timestamp: ts,
                        source: DateSource::Exiftool,
                    },
                );
            }
        }
    }

    // Pass 3: per-file fallbacks (filename pattern, then filesystem time) for
    // anything still unresolved.
    for path in paths {
        if out.contains_key(path) {
            continue;
        }

        if let Some(dt) = path
            .file_name()
            .and_then(|f| f.to_str())
            .and_then(crate::memories::extract_date_from_filename)
        {
            out.insert(
                path.clone(),
                ResolvedDate {
                    timestamp: dt.timestamp(),
                    source: DateSource::Filename,
                },
            );
            continue;
        }

        if let Ok(meta) = std::fs::metadata(path)
            && let Some(t) = earliest_fs_time(&meta)
        {
            let dt: DateTime<Utc> = t.into();
            out.insert(
                path.clone(),
                ResolvedDate {
                    timestamp: dt.timestamp(),
                    source: DateSource::FsTime,
                },
            );
        }
    }

    out
}

/// Tag priority for exiftool extraction. First non-zero value wins.
///
/// Photos: `DateTimeOriginal` (original capture) and `SubSecDateTimeOriginal`
/// are most authoritative. `CreateDate` is a common alias and a sane fallback.
///
/// Videos: `MediaCreateDate` / `TrackCreateDate` are the QuickTime/MP4
/// timestamps. `ContentCreateDate` is Apple's iOS-set tag; it often
/// reflects local capture time on iPhone exports better than the others.
///
/// Notably absent: `FileModifyDate` / `FileAccessDate` — those are
/// filesystem-derived and the resolver covers them via the `fs_time`
/// fallback.
Letting exiftool pull them here would mask "no real EXIF /// date" with a `source = exiftool` row that's no better than fs_time. const EXIFTOOL_DATE_TAGS: &[&str] = &[ "DateTimeOriginal", "SubSecDateTimeOriginal", "CreateDate", "MediaCreateDate", "TrackCreateDate", "ContentCreateDate", ]; /// Cache the "exiftool exists on PATH" check across the process lifetime so /// the per-tick backfill doesn't fork a doomed subprocess every iteration on /// deploys without exiftool installed. fn exiftool_available() -> bool { static AVAIL: OnceLock = OnceLock::new(); *AVAIL.get_or_init(|| { let ok = Command::new("exiftool") .arg("-ver") .stdout(Stdio::null()) .stderr(Stdio::null()) .status() .map(|s| s.success()) .unwrap_or(false); if !ok { warn!("exiftool not on PATH; date_taken waterfall skips that step"); } ok }) } /// One-file exiftool invocation. Used by the upload + GPS-write paths, /// which deal with one file at a time. The batch path uses /// `exiftool_dates_batch` so we don't pay subprocess startup per row. fn exiftool_date_single(path: &Path) -> Option { if !exiftool_available() { return None; } let mut cmd = Command::new("exiftool"); cmd.arg("-j").arg("-q").arg("-d").arg("%s").arg("-fast2"); for tag in EXIFTOOL_DATE_TAGS { cmd.arg(format!("-{}", tag)); } cmd.arg(path); let output = cmd.output().ok()?; if !output.status.success() { trace!("exiftool exited non-zero for {:?}", path); return None; } parse_exiftool_json(&output.stdout) .into_iter() .next() .map(|(_, ts)| ts) } /// Drain a batch via a single exiftool subprocess. Paths are fed on stdin /// via `-@ -`, so the argv stays short regardless of batch size — safe for /// libraries with very long path components. 
fn exiftool_dates_batch(paths: &[&Path]) -> HashMap { let mut out = HashMap::new(); if paths.is_empty() || !exiftool_available() { return out; } let mut cmd = Command::new("exiftool"); cmd.arg("-j").arg("-q").arg("-d").arg("%s").arg("-fast2"); for tag in EXIFTOOL_DATE_TAGS { cmd.arg(format!("-{}", tag)); } cmd.arg("-@").arg("-"); cmd.stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::null()); let mut child = match cmd.spawn() { Ok(c) => c, Err(e) => { warn!("exiftool batch spawn failed: {}", e); return out; } }; if let Some(mut stdin) = child.stdin.take() { for p in paths { // exiftool's argfile reader treats each line as one path; OS // path bytes don't always survive a String round-trip, but // every path we get here originated from rel_path / root_path // strings already, so to-string-lossy is a non-event. if let Err(e) = writeln!(stdin, "{}", p.display()) { warn!("exiftool batch stdin write failed: {}", e); break; } } } let output = match child.wait_with_output() { Ok(o) => o, Err(e) => { warn!("exiftool batch wait failed: {}", e); return out; } }; if !output.status.success() { debug!( "exiftool batch exit status {:?}; partial output may still parse", output.status.code() ); } for (source, ts) in parse_exiftool_json(&output.stdout) { out.insert(PathBuf::from(source), ts); } out } /// One row per input file. exiftool emits any tag we asked for that was /// present, plus the `SourceFile` it was reading. Tags are JSON values /// because `-d %s` returns the timestamp as a *string* of digits, not a /// number, when the date parses; absent tags are simply missing keys. 
#[derive(Debug, Deserialize)] struct ExiftoolEntry { #[serde(rename = "SourceFile")] source_file: String, #[serde(rename = "DateTimeOriginal")] date_time_original: Option, #[serde(rename = "SubSecDateTimeOriginal")] sub_sec_date_time_original: Option, #[serde(rename = "CreateDate")] create_date: Option, #[serde(rename = "MediaCreateDate")] media_create_date: Option, #[serde(rename = "TrackCreateDate")] track_create_date: Option, #[serde(rename = "ContentCreateDate")] content_create_date: Option, } fn parse_exiftool_json(stdout: &[u8]) -> Vec<(String, i64)> { let entries: Vec = match serde_json::from_slice(stdout) { Ok(v) => v, Err(e) => { // Empty stdout on total failure isn't a parse error worth // logging at warn — the caller already noted the non-zero // exit status. if !stdout.is_empty() { warn!("exiftool JSON parse failed: {}", e); } return Vec::new(); } }; let mut out = Vec::with_capacity(entries.len()); for entry in entries { // Walk the priority list. exiftool sometimes returns the literal // string "0000:00:00 00:00:00" for missing-but-allocated date // slots; with `-d %s` that becomes the unix epoch (0). Reject // anything <= 0 so we fall through to the next tag. let tags = [ entry.date_time_original.as_ref(), entry.sub_sec_date_time_original.as_ref(), entry.create_date.as_ref(), entry.media_create_date.as_ref(), entry.track_create_date.as_ref(), entry.content_create_date.as_ref(), ]; let mut chosen: Option = None; for tag in tags.iter().flatten() { if let Some(ts) = coerce_to_unix_seconds(tag) && ts > 0 { chosen = Some(ts); break; } } if let Some(ts) = chosen { out.push((entry.source_file, ts)); } } out } /// `-d %s` should hand us a numeric string, but exiftool's JSON encoder /// will emit a number when the tag was defined as numeric in its lib — /// accept both shapes. 
fn coerce_to_unix_seconds(v: &serde_json::Value) -> Option { match v { serde_json::Value::String(s) => s.trim().parse::().ok(), serde_json::Value::Number(n) => n.as_i64(), _ => None, } } #[cfg(test)] mod tests { use super::*; #[test] fn parse_exiftool_json_picks_first_priority_tag() { let json = br#"[{ "SourceFile": "/lib/IMG.jpg", "DateTimeOriginal": "1500000000", "CreateDate": "1400000000" }]"#; let parsed = parse_exiftool_json(json); assert_eq!(parsed, vec![("/lib/IMG.jpg".to_string(), 1500000000)]); } #[test] fn parse_exiftool_json_falls_through_zeros() { // exiftool emits "0000:00:00 00:00:00" → unix epoch 0 with -d %s. // The resolver should skip those and pick the next tag. let json = br#"[{ "SourceFile": "/lib/clip.mov", "DateTimeOriginal": "0", "MediaCreateDate": "1500000000" }]"#; let parsed = parse_exiftool_json(json); assert_eq!(parsed, vec![("/lib/clip.mov".to_string(), 1500000000)]); } #[test] fn parse_exiftool_json_accepts_numeric_values() { let json = br#"[{ "SourceFile": "/lib/a.jpg", "CreateDate": 1234567890 }]"#; let parsed = parse_exiftool_json(json); assert_eq!(parsed, vec![("/lib/a.jpg".to_string(), 1234567890)]); } #[test] fn parse_exiftool_json_emits_nothing_when_no_tag_present() { let json = br#"[{"SourceFile": "/lib/no_dates.bin"}]"#; let parsed = parse_exiftool_json(json); assert!(parsed.is_empty()); } #[test] fn parse_exiftool_json_handles_multiple_entries() { let json = br#"[ {"SourceFile": "/lib/a.jpg", "DateTimeOriginal": "100"}, {"SourceFile": "/lib/b.jpg", "CreateDate": "200"} ]"#; let parsed = parse_exiftool_json(json); assert_eq!( parsed, vec![ ("/lib/a.jpg".to_string(), 100), ("/lib/b.jpg".to_string(), 200) ] ); } #[test] fn date_source_as_str_round_trip() { for src in [ DateSource::Exif, DateSource::Exiftool, DateSource::Filename, DateSource::FsTime, ] { assert!(!src.as_str().is_empty()); } } #[test] fn resolve_uses_prior_exif_when_present() { // Path doesn't need to exist when prior_exif_date short-circuits. 
let resolved = resolve_date_taken(Path::new("/nonexistent/file.jpg"), Some(1700000000)).unwrap(); assert_eq!(resolved.timestamp, 1700000000); assert_eq!(resolved.source, DateSource::Exif); } #[test] fn resolve_filename_when_no_exif_and_file_missing() { // No prior EXIF, no exiftool match (file missing), but the filename // pattern still matches so the resolver lands on Filename. let resolved = resolve_date_taken( Path::new("/nonexistent/Screenshot_2014-06-01-20-44-50.png"), None, ) .unwrap(); assert_eq!(resolved.source, DateSource::Filename); } #[test] fn resolve_fs_time_when_only_metadata_available() { let dir = tempfile::tempdir().unwrap(); let path = dir.path().join("plain.jpg"); std::fs::File::create(&path).unwrap(); let resolved = resolve_date_taken(&path, None).unwrap(); // exiftool may or may not be installed in the test env; either // way the file has no EXIF and no filename date, so we should // fall to fs_time. assert_eq!(resolved.source, DateSource::FsTime); } }