date_resolver: canonical date_taken waterfall with exiftool fallback
New module that consolidates the four-step ingest waterfall: kamadak-exif (already in process via the caller's prior result) → exiftool fallback → filename regex → earliest_fs_time. Each step is tagged with a `DateSource` so the caller can persist provenance. The exiftool fallback is what makes videos and MakerNote-hosted dates land at all — kamadak-exif can't read QuickTime/MP4 or Nikon-style sub-IFDs. Single-file mode shells out per call; batch mode pipes paths on stdin via `-@ -` and fans the result through one subprocess so the upcoming per-tick drain doesn't pay startup cost per row. The `exiftool` PATH check is cached in a `OnceLock` to keep the drain short-circuited on deploys without exiftool installed. `SubSecDateTimeOriginal` and `ContentCreateDate` are pulled alongside the standard tags to capture iPhone's sub-second precision and Apple's preferred capture-time tag respectively. `FileModifyDate` is deliberately *not* in the tag list — it's a filesystem-derived value the resolver already covers via the `fs_time` step, and pulling it through exiftool would mask "no real EXIF date" with a misleading `source = exiftool` row. Module is registered in both `lib.rs` and `main.rs` (sibling-module pattern the rest of the bin uses); no callers wired in yet — that lands in the next commit. Comes with 9 unit tests covering JSON parsing edge cases, source-priority short-circuiting, and the fs_time-when-no-exif path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
495
src/date_resolver.rs
Normal file
495
src/date_resolver.rs
Normal file
@@ -0,0 +1,495 @@
|
||||
//! Canonical `date_taken` resolution for ingest and the per-tick backfill
|
||||
//! drain.
|
||||
//!
|
||||
//! The waterfall (in order; first hit wins):
|
||||
//!
|
||||
//! 1. **kamadak-exif** — fast in-process EXIF read. Already done by
|
||||
//! `exif::extract_exif_from_path` for image-bearing formats; callers
|
||||
//! pass that result in via `prior_exif_date` so we don't re-parse.
|
||||
//! 2. **exiftool** — shell-out fallback that reaches places kamadak-exif
|
||||
//! can't: QuickTime/MP4 (`MediaCreateDate`, `TrackCreateDate`,
|
||||
//! `CreateDate`), Apple's `ContentCreateDate`, MakerNote sub-IFDs.
|
||||
//! Required for videos to land a real date; degrades silently when
|
||||
//! `exiftool` isn't on PATH.
|
||||
//! 3. **filename regex** — `memories::extract_date_from_filename` covers
|
||||
//! common screenshot / chat-export / timestamp-named patterns.
|
||||
//! 4. **earliest filesystem time** — `utils::earliest_fs_time` picks the
|
||||
//! earlier of created / modified, which on copied-from-backup files is
|
||||
//! a better proxy for content age than either alone.
|
||||
//!
|
||||
//! `DateSource` records which step won so the per-tick drain can re-resolve
|
||||
//! weak sources (`fs_time`) once exiftool becomes available, and so the
|
||||
//! UI/debug surface can answer "why does this photo show up under this
|
||||
//! date." Note that the previous `/memories` request-time logic preferred
|
||||
//! filename even when EXIF was present; this resolver inverts that — EXIF
|
||||
//! is authoritative when it exists, on the theory that an EXIF
|
||||
//! `DateTimeOriginal` is more reliable than a filename pattern that may
|
||||
//! reflect import time rather than capture time.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use log::{debug, trace, warn};
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::utils::earliest_fs_time;
|
||||
|
||||
/// Which step of the date-resolution waterfall produced a timestamp.
///
/// Variants are declared in waterfall order, strongest source first.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DateSource {
    /// The caller-supplied in-process EXIF (kamadak-exif) date was used.
    Exif,
    /// The exiftool subprocess fallback supplied the date.
    Exiftool,
    /// A date pattern recognised by `extract_date_from_filename` was used.
    Filename,
    /// Nothing better was found; the value comes from `earliest_fs_time`.
    FsTime,
}

impl DateSource {
    /// Stable lowercase label for this source (e.g. `"fs_time"`).
    pub fn as_str(self) -> &'static str {
        match self {
            Self::Exif => "exif",
            Self::Exiftool => "exiftool",
            Self::Filename => "filename",
            Self::FsTime => "fs_time",
        }
    }
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct ResolvedDate {
|
||||
pub timestamp: i64,
|
||||
pub source: DateSource,
|
||||
}
|
||||
|
||||
/// Resolve the canonical date for a single file, given an already-extracted
|
||||
/// kamadak-exif date if available. Returns `None` only if every step in the
|
||||
/// waterfall fails — for files that exist on disk this should be vanishingly
|
||||
/// rare (the fs-time fallback alone almost always succeeds).
|
||||
pub fn resolve_date_taken(path: &Path, prior_exif_date: Option<i64>) -> Option<ResolvedDate> {
|
||||
if let Some(ts) = prior_exif_date {
|
||||
return Some(ResolvedDate {
|
||||
timestamp: ts,
|
||||
source: DateSource::Exif,
|
||||
});
|
||||
}
|
||||
if let Some(ts) = exiftool_date_single(path) {
|
||||
return Some(ResolvedDate {
|
||||
timestamp: ts,
|
||||
source: DateSource::Exiftool,
|
||||
});
|
||||
}
|
||||
if let Some(dt) = path
|
||||
.file_name()
|
||||
.and_then(|f| f.to_str())
|
||||
.and_then(crate::memories::extract_date_from_filename)
|
||||
{
|
||||
return Some(ResolvedDate {
|
||||
timestamp: dt.timestamp(),
|
||||
source: DateSource::Filename,
|
||||
});
|
||||
}
|
||||
if let Ok(meta) = std::fs::metadata(path)
|
||||
&& let Some(t) = earliest_fs_time(&meta)
|
||||
{
|
||||
let dt: DateTime<Utc> = t.into();
|
||||
return Some(ResolvedDate {
|
||||
timestamp: dt.timestamp(),
|
||||
source: DateSource::FsTime,
|
||||
});
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Batch waterfall. exiftool runs once over the whole batch (single
|
||||
/// subprocess); everything else is per-file and runs only on misses.
|
||||
/// `prior_exif_dates` lets the caller pass in already-known kamadak dates
|
||||
/// keyed by path; entries without a prior date fall through to exiftool
|
||||
/// and the rest of the waterfall.
|
||||
///
|
||||
/// The per-tick backfill drain is the primary caller — it loads ~500 rows
|
||||
/// at a time and uses one exiftool subprocess to drain the lot.
|
||||
pub fn resolve_dates_batch(
|
||||
paths: &[PathBuf],
|
||||
prior_exif_dates: &HashMap<PathBuf, i64>,
|
||||
) -> HashMap<PathBuf, ResolvedDate> {
|
||||
let mut out: HashMap<PathBuf, ResolvedDate> = HashMap::new();
|
||||
let mut needs_exiftool: Vec<&Path> = Vec::with_capacity(paths.len());
|
||||
|
||||
for path in paths {
|
||||
if let Some(&ts) = prior_exif_dates.get(path) {
|
||||
out.insert(
|
||||
path.clone(),
|
||||
ResolvedDate {
|
||||
timestamp: ts,
|
||||
source: DateSource::Exif,
|
||||
},
|
||||
);
|
||||
} else {
|
||||
needs_exiftool.push(path.as_path());
|
||||
}
|
||||
}
|
||||
|
||||
if !needs_exiftool.is_empty() {
|
||||
let exiftool_results = exiftool_dates_batch(&needs_exiftool);
|
||||
for path in &needs_exiftool {
|
||||
if let Some(&ts) = exiftool_results.get(*path) {
|
||||
out.insert(
|
||||
path.to_path_buf(),
|
||||
ResolvedDate {
|
||||
timestamp: ts,
|
||||
source: DateSource::Exiftool,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for path in paths {
|
||||
if out.contains_key(path) {
|
||||
continue;
|
||||
}
|
||||
if let Some(dt) = path
|
||||
.file_name()
|
||||
.and_then(|f| f.to_str())
|
||||
.and_then(crate::memories::extract_date_from_filename)
|
||||
{
|
||||
out.insert(
|
||||
path.clone(),
|
||||
ResolvedDate {
|
||||
timestamp: dt.timestamp(),
|
||||
source: DateSource::Filename,
|
||||
},
|
||||
);
|
||||
continue;
|
||||
}
|
||||
if let Ok(meta) = std::fs::metadata(path)
|
||||
&& let Some(t) = earliest_fs_time(&meta)
|
||||
{
|
||||
let dt: DateTime<Utc> = t.into();
|
||||
out.insert(
|
||||
path.clone(),
|
||||
ResolvedDate {
|
||||
timestamp: dt.timestamp(),
|
||||
source: DateSource::FsTime,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
/// Tags requested from exiftool, each passed on the command line as `-TAG`.
///
/// This list controls which tags exiftool is asked to emit. The order in
/// which the emitted values are tried is set by the candidate array in
/// `parse_exiftool_json`, which currently mirrors this order; keep the two
/// in sync when adding a tag.
///
/// * Photos: `DateTimeOriginal` and `SubSecDateTimeOriginal` are the
///   original-capture tags; `CreateDate` is a common fallback.
/// * Videos: `MediaCreateDate` / `TrackCreateDate` are the QuickTime/MP4
///   timestamps, and `ContentCreateDate` is Apple's capture-time tag.
///
/// `FileModifyDate` / `FileAccessDate` are intentionally left out: they are
/// filesystem-derived, the resolver already has an `fs_time` step for that,
/// and reporting them as `source = exiftool` would hide the fact that the
/// file carries no real embedded date.
const EXIFTOOL_DATE_TAGS: &[&str] = &[
    // Photo capture tags.
    "DateTimeOriginal",
    "SubSecDateTimeOriginal",
    "CreateDate",
    // QuickTime / MP4 container tags.
    "MediaCreateDate",
    "TrackCreateDate",
    // Apple's capture-time tag.
    "ContentCreateDate",
];
|
||||
|
||||
/// Cache the "exiftool exists on PATH" check across the process lifetime so
|
||||
/// the per-tick backfill doesn't fork a doomed subprocess every iteration on
|
||||
/// deploys without exiftool installed.
|
||||
fn exiftool_available() -> bool {
|
||||
static AVAIL: OnceLock<bool> = OnceLock::new();
|
||||
*AVAIL.get_or_init(|| {
|
||||
let ok = Command::new("exiftool")
|
||||
.arg("-ver")
|
||||
.stdout(Stdio::null())
|
||||
.stderr(Stdio::null())
|
||||
.status()
|
||||
.map(|s| s.success())
|
||||
.unwrap_or(false);
|
||||
if !ok {
|
||||
warn!("exiftool not on PATH; date_taken waterfall skips that step");
|
||||
}
|
||||
ok
|
||||
})
|
||||
}
|
||||
|
||||
/// One-file exiftool invocation. Used by the upload + GPS-write paths,
|
||||
/// which deal with one file at a time. The batch path uses
|
||||
/// `exiftool_dates_batch` so we don't pay subprocess startup per row.
|
||||
fn exiftool_date_single(path: &Path) -> Option<i64> {
|
||||
if !exiftool_available() {
|
||||
return None;
|
||||
}
|
||||
let mut cmd = Command::new("exiftool");
|
||||
cmd.arg("-j").arg("-q").arg("-d").arg("%s").arg("-fast2");
|
||||
for tag in EXIFTOOL_DATE_TAGS {
|
||||
cmd.arg(format!("-{}", tag));
|
||||
}
|
||||
cmd.arg(path);
|
||||
let output = cmd.output().ok()?;
|
||||
if !output.status.success() {
|
||||
trace!("exiftool exited non-zero for {:?}", path);
|
||||
return None;
|
||||
}
|
||||
parse_exiftool_json(&output.stdout)
|
||||
.into_iter()
|
||||
.next()
|
||||
.map(|(_, ts)| ts)
|
||||
}
|
||||
|
||||
/// Drain a batch via a single exiftool subprocess. Paths are fed on stdin
|
||||
/// via `-@ -`, so the argv stays short regardless of batch size — safe for
|
||||
/// libraries with very long path components.
|
||||
fn exiftool_dates_batch(paths: &[&Path]) -> HashMap<PathBuf, i64> {
|
||||
let mut out = HashMap::new();
|
||||
if paths.is_empty() || !exiftool_available() {
|
||||
return out;
|
||||
}
|
||||
|
||||
let mut cmd = Command::new("exiftool");
|
||||
cmd.arg("-j").arg("-q").arg("-d").arg("%s").arg("-fast2");
|
||||
for tag in EXIFTOOL_DATE_TAGS {
|
||||
cmd.arg(format!("-{}", tag));
|
||||
}
|
||||
cmd.arg("-@").arg("-");
|
||||
cmd.stdin(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::null());
|
||||
|
||||
let mut child = match cmd.spawn() {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
warn!("exiftool batch spawn failed: {}", e);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(mut stdin) = child.stdin.take() {
|
||||
for p in paths {
|
||||
// exiftool's argfile reader treats each line as one path; OS
|
||||
// path bytes don't always survive a String round-trip, but
|
||||
// every path we get here originated from rel_path / root_path
|
||||
// strings already, so to-string-lossy is a non-event.
|
||||
if let Err(e) = writeln!(stdin, "{}", p.display()) {
|
||||
warn!("exiftool batch stdin write failed: {}", e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let output = match child.wait_with_output() {
|
||||
Ok(o) => o,
|
||||
Err(e) => {
|
||||
warn!("exiftool batch wait failed: {}", e);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
if !output.status.success() {
|
||||
debug!(
|
||||
"exiftool batch exit status {:?}; partial output may still parse",
|
||||
output.status.code()
|
||||
);
|
||||
}
|
||||
for (source, ts) in parse_exiftool_json(&output.stdout) {
|
||||
out.insert(PathBuf::from(source), ts);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// One row per input file. exiftool emits any tag we asked for that was
|
||||
/// present, plus the `SourceFile` it was reading. Tags are JSON values
|
||||
/// because `-d %s` returns the timestamp as a *string* of digits, not a
|
||||
/// number, when the date parses; absent tags are simply missing keys.
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct ExiftoolEntry {
|
||||
#[serde(rename = "SourceFile")]
|
||||
source_file: String,
|
||||
#[serde(rename = "DateTimeOriginal")]
|
||||
date_time_original: Option<serde_json::Value>,
|
||||
#[serde(rename = "SubSecDateTimeOriginal")]
|
||||
sub_sec_date_time_original: Option<serde_json::Value>,
|
||||
#[serde(rename = "CreateDate")]
|
||||
create_date: Option<serde_json::Value>,
|
||||
#[serde(rename = "MediaCreateDate")]
|
||||
media_create_date: Option<serde_json::Value>,
|
||||
#[serde(rename = "TrackCreateDate")]
|
||||
track_create_date: Option<serde_json::Value>,
|
||||
#[serde(rename = "ContentCreateDate")]
|
||||
content_create_date: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
fn parse_exiftool_json(stdout: &[u8]) -> Vec<(String, i64)> {
|
||||
let entries: Vec<ExiftoolEntry> = match serde_json::from_slice(stdout) {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
// Empty stdout on total failure isn't a parse error worth
|
||||
// logging at warn — the caller already noted the non-zero
|
||||
// exit status.
|
||||
if !stdout.is_empty() {
|
||||
warn!("exiftool JSON parse failed: {}", e);
|
||||
}
|
||||
return Vec::new();
|
||||
}
|
||||
};
|
||||
|
||||
let mut out = Vec::with_capacity(entries.len());
|
||||
for entry in entries {
|
||||
// Walk the priority list. exiftool sometimes returns the literal
|
||||
// string "0000:00:00 00:00:00" for missing-but-allocated date
|
||||
// slots; with `-d %s` that becomes the unix epoch (0). Reject
|
||||
// anything <= 0 so we fall through to the next tag.
|
||||
let tags = [
|
||||
entry.date_time_original.as_ref(),
|
||||
entry.sub_sec_date_time_original.as_ref(),
|
||||
entry.create_date.as_ref(),
|
||||
entry.media_create_date.as_ref(),
|
||||
entry.track_create_date.as_ref(),
|
||||
entry.content_create_date.as_ref(),
|
||||
];
|
||||
let mut chosen: Option<i64> = None;
|
||||
for tag in tags.iter().flatten() {
|
||||
if let Some(ts) = coerce_to_unix_seconds(tag)
|
||||
&& ts > 0
|
||||
{
|
||||
chosen = Some(ts);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if let Some(ts) = chosen {
|
||||
out.push((entry.source_file, ts));
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// `-d %s` should hand us a numeric string, but exiftool's JSON encoder
|
||||
/// will emit a number when the tag was defined as numeric in its lib —
|
||||
/// accept both shapes.
|
||||
fn coerce_to_unix_seconds(v: &serde_json::Value) -> Option<i64> {
|
||||
match v {
|
||||
serde_json::Value::String(s) => s.trim().parse::<i64>().ok(),
|
||||
serde_json::Value::Number(n) => n.as_i64(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// When several tags are present, the highest-priority one is chosen.
    #[test]
    fn parse_exiftool_json_picks_first_priority_tag() {
        let json = br#"[{
            "SourceFile": "/lib/IMG.jpg",
            "DateTimeOriginal": "1500000000",
            "CreateDate": "1400000000"
        }]"#;
        let parsed = parse_exiftool_json(json);
        assert_eq!(parsed, vec![("/lib/IMG.jpg".to_string(), 1500000000)]);
    }

    /// A zero timestamp is treated as absent and the next tag is used.
    #[test]
    fn parse_exiftool_json_falls_through_zeros() {
        // exiftool emits "0000:00:00 00:00:00" → unix epoch 0 with -d %s.
        // The resolver should skip those and pick the next tag.
        let json = br#"[{
            "SourceFile": "/lib/clip.mov",
            "DateTimeOriginal": "0",
            "MediaCreateDate": "1500000000"
        }]"#;
        let parsed = parse_exiftool_json(json);
        assert_eq!(parsed, vec![("/lib/clip.mov".to_string(), 1500000000)]);
    }

    /// Tag values may be JSON numbers rather than digit strings.
    #[test]
    fn parse_exiftool_json_accepts_numeric_values() {
        let json = br#"[{
            "SourceFile": "/lib/a.jpg",
            "CreateDate": 1234567890
        }]"#;
        let parsed = parse_exiftool_json(json);
        assert_eq!(parsed, vec![("/lib/a.jpg".to_string(), 1234567890)]);
    }

    /// An entry without any date tag produces no output row.
    #[test]
    fn parse_exiftool_json_emits_nothing_when_no_tag_present() {
        let json = br#"[{"SourceFile": "/lib/no_dates.bin"}]"#;
        let parsed = parse_exiftool_json(json);
        assert!(parsed.is_empty());
    }

    /// Multiple entries are returned in input order.
    #[test]
    fn parse_exiftool_json_handles_multiple_entries() {
        let json = br#"[
            {"SourceFile": "/lib/a.jpg", "DateTimeOriginal": "100"},
            {"SourceFile": "/lib/b.jpg", "CreateDate": "200"}
        ]"#;
        let parsed = parse_exiftool_json(json);
        assert_eq!(
            parsed,
            vec![
                ("/lib/a.jpg".to_string(), 100),
                ("/lib/b.jpg".to_string(), 200)
            ]
        );
    }

    /// The labels are meant to be persisted as provenance, so pin the exact
    /// strings (and their uniqueness) rather than just "non-empty": a
    /// rename here would silently orphan previously stored rows.
    #[test]
    fn date_source_as_str_round_trip() {
        let expected = [
            (DateSource::Exif, "exif"),
            (DateSource::Exiftool, "exiftool"),
            (DateSource::Filename, "filename"),
            (DateSource::FsTime, "fs_time"),
        ];
        for (src, label) in expected {
            assert_eq!(src.as_str(), label);
        }
        let unique: std::collections::HashSet<&str> =
            expected.iter().map(|(src, _)| src.as_str()).collect();
        assert_eq!(unique.len(), expected.len());
    }

    /// A caller-supplied EXIF date short-circuits the whole waterfall.
    #[test]
    fn resolve_uses_prior_exif_when_present() {
        // Path doesn't need to exist when prior_exif_date short-circuits.
        let resolved =
            resolve_date_taken(Path::new("/nonexistent/file.jpg"), Some(1700000000)).unwrap();
        assert_eq!(resolved.timestamp, 1700000000);
        assert_eq!(resolved.source, DateSource::Exif);
    }

    /// With no EXIF and no readable file, the filename pattern still wins.
    #[test]
    fn resolve_filename_when_no_exif_and_file_missing() {
        // No prior EXIF, no exiftool match (file missing), but the filename
        // pattern still matches so the resolver lands on Filename.
        let resolved = resolve_date_taken(
            Path::new("/nonexistent/Screenshot_2014-06-01-20-44-50.png"),
            None,
        )
        .unwrap();
        assert_eq!(resolved.source, DateSource::Filename);
    }

    /// A real but metadata-free file falls all the way through to fs_time.
    #[test]
    fn resolve_fs_time_when_only_metadata_available() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("plain.jpg");
        std::fs::File::create(&path).unwrap();
        let resolved = resolve_date_taken(&path, None).unwrap();
        // exiftool may or may not be installed in the test env; either
        // way the file has no EXIF and no filename date, so we should
        // fall to fs_time.
        assert_eq!(resolved.source, DateSource::FsTime);
    }
}
|
||||
@@ -10,6 +10,7 @@ pub mod cleanup;
|
||||
pub mod content_hash;
|
||||
pub mod data;
|
||||
pub mod database;
|
||||
pub mod date_resolver;
|
||||
pub mod duplicates;
|
||||
pub mod error;
|
||||
pub mod exif;
|
||||
|
||||
@@ -64,6 +64,7 @@ mod auth;
|
||||
mod content_hash;
|
||||
mod data;
|
||||
mod database;
|
||||
mod date_resolver;
|
||||
mod duplicates;
|
||||
mod error;
|
||||
mod exif;
|
||||
|
||||
Reference in New Issue
Block a user