Merge pull request 'feature/canonical-date-taken' (#76) from feature/canonical-date-taken into master
Reviewed-on: #76
This commit was merged in pull request #76.
This commit is contained in:
48
CLAUDE.md
48
CLAUDE.md
@@ -364,6 +364,53 @@ Runs in background thread with two-tier strategy:
|
||||
- Batch queries EXIF DB to detect new files
|
||||
- Configurable via `WATCH_QUICK_INTERVAL_SECONDS` and `WATCH_FULL_INTERVAL_SECONDS`
|
||||
|
||||
**Canonical date_taken pipeline (`src/date_resolver.rs`).** Every row's
|
||||
`image_exif.date_taken` is populated at ingest by a four-step waterfall;
|
||||
which step won is recorded in `image_exif.date_taken_source` so the
|
||||
per-tick drain can re-resolve weak entries when better tools become
|
||||
available, and so the UI/debug surface can answer "why did this photo
|
||||
land on this date?". Order:
|
||||
|
||||
1. **`exif`** — kamadak-exif `DateTime` / `DateTimeOriginal`. Fast,
|
||||
in-process, image-only.
|
||||
2. **`exiftool`** — shell-out fallback for tags kamadak can't reach:
|
||||
QuickTime/MP4 (`MediaCreateDate`, `TrackCreateDate`, `CreateDate`),
|
||||
Apple's `ContentCreateDate`, MakerNote sub-IFDs. Required for
|
||||
videos to land a real date. Single-file at ingest; the per-tick
|
||||
drain feeds the whole batch through one `exiftool -@ -` subprocess.
|
||||
Degrades silently when `exiftool` isn't on PATH (resolver caches the
|
||||
"available" check via `OnceLock`).
|
||||
3. **`filename`** — `extract_date_from_filename` in `memories.rs`
|
||||
matches screenshot, chat-export, and timestamp-named patterns.
|
||||
4. **`fs_time`** — `earliest_fs_time(metadata)` (earlier of created /
|
||||
modified). Last resort.
|
||||
|
||||
Notable behavior change vs. the pre-2026-05 request-time logic:
|
||||
**EXIF beats filename when both are present.** A photo named
|
||||
`Screenshot_2014-06-01.png` whose EXIF `DateTime` is 2021 now appears
|
||||
under 2021, not 2014 — on the theory that EXIF is more reliable than
|
||||
import-named filenames. The reverse case (no EXIF, filename has a
|
||||
date) is unchanged.
|
||||
|
||||
The `backfill_missing_date_taken` drain (`src/main.rs`) runs every
|
||||
watcher tick alongside `backfill_unhashed_backlog`. It loads up to
|
||||
`DATE_BACKFILL_MAX_PER_TICK` rows (default 500) where
|
||||
`date_taken IS NULL OR date_taken_source = 'fs_time'` (backed by the
|
||||
`idx_image_exif_date_backfill` partial index), runs the waterfall
|
||||
batch via `resolve_dates_batch`, and writes results via the
|
||||
`backfill_date_taken` DAO method (touches only `date_taken` +
|
||||
`date_taken_source` so EXIF / hash / perceptual columns are
|
||||
preserved). `filename`-sourced rows are intentionally not re-resolved
|
||||
— the regex is authoritative when it matches, and re-running exiftool
|
||||
won't change the answer.
|
||||
|
||||
`/memories` is a single SQL query against this column
|
||||
(`get_memories_in_window` in `src/database/mod.rs`), using
|
||||
`strftime('%m-%d' | '%W' | '%m', date_taken, 'unixepoch', tz)` for
|
||||
calendar matching with the client's timezone offset. The pre-rewrite
|
||||
version stat'd every row and walked the entire library tree — at
|
||||
~14k photos this took 10–15 s; the rewrite is single-digit ms.
|
||||
|
||||
**EXIF Extraction:**
|
||||
- Uses `kamadak-exif` crate
|
||||
- Supports: JPEG, TIFF, RAW (NEF, CR2, CR3), HEIF/HEIC, PNG, WebP
|
||||
@@ -534,6 +581,7 @@ Optional:
|
||||
```bash
|
||||
WATCH_QUICK_INTERVAL_SECONDS=60 # Quick scan interval
|
||||
WATCH_FULL_INTERVAL_SECONDS=3600 # Full scan interval
|
||||
DATE_BACKFILL_MAX_PER_TICK=500 # Cap on canonical-date drain per watcher tick
|
||||
OTLP_OTLS_ENDPOINT=http://... # OpenTelemetry collector (release builds)
|
||||
|
||||
# AI Insights Configuration
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
DROP INDEX IF EXISTS idx_image_exif_date_backfill;
|
||||
ALTER TABLE image_exif DROP COLUMN date_taken_source;
|
||||
24
migrations/2026-05-06-000000_add_date_taken_source/up.sql
Normal file
24
migrations/2026-05-06-000000_add_date_taken_source/up.sql
Normal file
@@ -0,0 +1,24 @@
|
||||
-- Tracks where a row's `date_taken` was sourced so the canonical-date
|
||||
-- waterfall (kamadak-exif → exiftool → filename → earliest_fs_time) is
|
||||
-- visible to debugging and to the per-tick backfill drain that re-runs
|
||||
-- weak sources once stronger ones become available (e.g. exiftool gets
|
||||
-- installed on a deploy that didn't have it). See CLAUDE.md → Memories
|
||||
-- canonical-date pipeline.
|
||||
--
|
||||
-- Values:
|
||||
-- 'exif' — kamadak-exif read DateTime/DateTimeOriginal directly
|
||||
-- 'exiftool' — exiftool fallback caught a video / MakerNote / QuickTime tag
|
||||
-- 'filename' — extract_date_from_filename matched a known pattern
|
||||
-- 'fs_time' — fell through to earliest_fs_time(metadata)
|
||||
--
|
||||
-- NULL when `date_taken` itself is NULL (no source resolved the date).
|
||||
ALTER TABLE image_exif ADD COLUMN date_taken_source TEXT;
|
||||
|
||||
-- Partial index for the per-tick backfill drain: targets rows that need
|
||||
-- re-resolution (no date yet, or only the weakest source resolved it).
|
||||
-- Filename-sourced rows are intentionally excluded — the regex is
|
||||
-- authoritative when it matches and re-running exiftool wouldn't change
|
||||
-- the answer.
|
||||
CREATE INDEX idx_image_exif_date_backfill
|
||||
ON image_exif (library_id, id)
|
||||
WHERE date_taken IS NULL OR date_taken_source = 'fs_time';
|
||||
@@ -0,0 +1,9 @@
|
||||
-- Reverting this migration is a no-op: the labels we wrote in `up.sql`
|
||||
-- are correct under any state of the schema (every dated row was indeed
|
||||
-- exif-sourced before the resolver landed), and there's no signal that
|
||||
-- distinguishes "labelled by this migration" from "labelled by the
|
||||
-- ingest path post-resolver". Clearing them would break the drain's
|
||||
-- eligibility filter again.
|
||||
--
|
||||
-- The companion migration `2026-05-06-000000_add_date_taken_source` is
|
||||
-- the one to revert if you need to remove the column entirely.
|
||||
@@ -0,0 +1,20 @@
|
||||
-- Backfill `date_taken_source` for rows that pre-date the canonical-date
|
||||
-- pipeline. Before the resolver landed, `image_exif.date_taken` could
|
||||
-- only be populated via `exif::extract_exif_from_path` (kamadak-exif)
|
||||
-- on the file-watcher, upload, or GPS-write paths. The resolver column
|
||||
-- migration added `date_taken_source` defaulting to NULL, so every
|
||||
-- historical row with a date is currently unlabelled — and the
|
||||
-- per-tick drain skips them because its eligibility predicate is
|
||||
-- `date_taken IS NULL OR date_taken_source = 'fs_time'`.
|
||||
--
|
||||
-- Label them `'exif'` once and let the drain take over from here. Safe
|
||||
-- because every code path that wrote `date_taken` prior to the
|
||||
-- resolver was a kamadak-exif read — there was no other source.
|
||||
--
|
||||
-- Idempotent: re-running this migration on a DB that has already been
|
||||
-- backfilled is a no-op (the WHERE clause matches nothing the second
|
||||
-- time around).
|
||||
UPDATE image_exif
|
||||
SET date_taken_source = 'exif'
|
||||
WHERE date_taken IS NOT NULL
|
||||
AND date_taken_source IS NULL;
|
||||
@@ -9,6 +9,21 @@ use crate::database::models::{
|
||||
};
|
||||
use crate::otel::trace_db_call;
|
||||
|
||||
/// Decoded shape for `get_memories_in_window`'s raw `sql_query`. Diesel's
|
||||
/// query DSL doesn't expose strftime, so the memories filter is hand-
|
||||
/// written SQL — but the returned columns are simple enough that a small
|
||||
/// `QueryableByName` struct suffices, kept private to this module.
|
||||
#[derive(diesel::QueryableByName)]
|
||||
#[allow(dead_code)] // fields read via Diesel's QueryableByName derive
|
||||
struct MemoriesWindowRow {
|
||||
#[diesel(sql_type = diesel::sql_types::Text)]
|
||||
rel_path: String,
|
||||
#[diesel(sql_type = diesel::sql_types::BigInt)]
|
||||
date_taken: i64,
|
||||
#[diesel(sql_type = diesel::sql_types::BigInt)]
|
||||
last_modified: i64,
|
||||
}
|
||||
|
||||
/// Wire shape for a single member of a duplicate group, returned by
|
||||
/// `list_duplicates_*` and `lookup_duplicate_row`. Carries everything
|
||||
/// the Apollo modal needs to render a member tile and its meta line —
|
||||
@@ -396,6 +411,63 @@ pub trait ExifDao: Sync + Send {
|
||||
size_bytes: i64,
|
||||
) -> Result<(), DbError>;
|
||||
|
||||
/// Return image_exif rows that need their `date_taken` re-resolved by
|
||||
/// the canonical-date waterfall (see `crate::date_resolver`):
|
||||
/// either no source ever ran (`date_taken IS NULL`), or only the
|
||||
/// weakest fallback resolved it (`date_taken_source = 'fs_time'`).
|
||||
/// Returns `(library_id, rel_path)`. The caller filters to its own
|
||||
/// library on the way through; rows from other libraries fall to the
|
||||
/// next library's tick. Backed by the partial index
|
||||
/// `idx_image_exif_date_backfill`.
|
||||
fn get_rows_needing_date_backfill(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id: i32,
|
||||
limit: i64,
|
||||
) -> Result<Vec<(i32, String)>, DbError>;
|
||||
|
||||
/// Persist a resolver result for an existing row. Touches `date_taken`
|
||||
/// and `date_taken_source` only — leaves all other columns alone so
|
||||
/// the drain doesn't accidentally clobber EXIF/hash/perceptual data
|
||||
/// the watcher / GPS-write path may have already written.
|
||||
fn backfill_date_taken(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id: i32,
|
||||
rel_path: &str,
|
||||
date_taken: i64,
|
||||
source: &str,
|
||||
) -> Result<(), DbError>;
|
||||
|
||||
/// Single-query backend for `/memories`. Returns
|
||||
/// `(rel_path, date_taken, last_modified)` for rows in `library_id`
|
||||
/// whose `date_taken` falls within `[now - years_back y, now]` and
|
||||
/// whose calendar position matches the request's span:
|
||||
/// - `"day"` — same month + day-of-month (any year)
|
||||
/// - `"week"` — same week-of-year (SQLite `%W`, Monday-anchored —
|
||||
/// close to but not exactly ISO week 8601; the
|
||||
/// boundary cases at year-start/end can shift by ±1
|
||||
/// vs the prior request-time `iso_week()` filter)
|
||||
/// - `"month"` — same month (any year)
|
||||
///
|
||||
/// `tz_offset_minutes` is applied to both sides of the strftime
|
||||
/// comparison so the calendar match is in the user's local time.
|
||||
/// Backed by the `(library_id, date_taken)` index.
|
||||
///
|
||||
/// This is the single-SQL replacement for the EXIF-loop +
|
||||
/// WalkDir-fallback that powered `/memories` previously; it's
|
||||
/// correct only because the canonical-date waterfall at ingest
|
||||
/// (`crate::date_resolver`) populates `date_taken` for every row
|
||||
/// it can resolve.
|
||||
fn get_memories_in_window(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id: i32,
|
||||
span_token: &str,
|
||||
years_back: i32,
|
||||
tz_offset_minutes: i32,
|
||||
) -> Result<Vec<(String, i64, i64)>, DbError>;
|
||||
|
||||
/// Return image rows that have a `content_hash` but no `phash_64`,
|
||||
/// oldest first. Used by the `backfill_perceptual_hash` binary.
|
||||
/// Filters by image extension at the DB layer to avoid ever asking
|
||||
@@ -730,6 +802,7 @@ impl ExifDao for SqliteExifDao {
|
||||
shutter_speed.eq(&exif_data.shutter_speed),
|
||||
iso.eq(&exif_data.iso),
|
||||
date_taken.eq(&exif_data.date_taken),
|
||||
date_taken_source.eq(&exif_data.date_taken_source),
|
||||
last_modified.eq(&exif_data.last_modified),
|
||||
))
|
||||
.execute(connection.deref_mut())
|
||||
@@ -1055,6 +1128,117 @@ impl ExifDao for SqliteExifDao {
|
||||
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
|
||||
}
|
||||
|
||||
fn get_rows_needing_date_backfill(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id_val: i32,
|
||||
limit: i64,
|
||||
) -> Result<Vec<(i32, String)>, DbError> {
|
||||
trace_db_call(
|
||||
context,
|
||||
"query",
|
||||
"get_rows_needing_date_backfill",
|
||||
|_span| {
|
||||
use schema::image_exif::dsl::*;
|
||||
|
||||
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
|
||||
|
||||
// The partial index is on `(library_id, id) WHERE date_taken
|
||||
// IS NULL OR date_taken_source = 'fs_time'`, so the planner
|
||||
// hits it directly when both predicates are present.
|
||||
image_exif
|
||||
.filter(library_id.eq(library_id_val))
|
||||
.filter(date_taken.is_null().or(date_taken_source.eq("fs_time")))
|
||||
.select((library_id, rel_path))
|
||||
.order(id.asc())
|
||||
.limit(limit)
|
||||
.load::<(i32, String)>(connection.deref_mut())
|
||||
.map_err(|_| anyhow::anyhow!("Query error"))
|
||||
},
|
||||
)
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
|
||||
fn backfill_date_taken(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id_val: i32,
|
||||
rel_path_val: &str,
|
||||
date_taken_val: i64,
|
||||
source: &str,
|
||||
) -> Result<(), DbError> {
|
||||
trace_db_call(context, "update", "backfill_date_taken", |_span| {
|
||||
use schema::image_exif::dsl::*;
|
||||
|
||||
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
|
||||
|
||||
diesel::update(
|
||||
image_exif
|
||||
.filter(library_id.eq(library_id_val))
|
||||
.filter(rel_path.eq(rel_path_val)),
|
||||
)
|
||||
.set((date_taken.eq(date_taken_val), date_taken_source.eq(source)))
|
||||
.execute(connection.deref_mut())
|
||||
.map(|_| ())
|
||||
.map_err(|_| anyhow::anyhow!("Update error"))
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
|
||||
}
|
||||
|
||||
fn get_memories_in_window(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id: i32,
|
||||
span_token: &str,
|
||||
years_back: i32,
|
||||
tz_offset_minutes: i32,
|
||||
) -> Result<Vec<(String, i64, i64)>, DbError> {
|
||||
trace_db_call(context, "query", "get_memories_in_window", |_span| {
|
||||
// strftime pattern is span-dependent; the rest of the WHERE
|
||||
// clause is shared. Only `%m-%d`, `%W`, `%m` are accepted —
|
||||
// anything else is a programmer error.
|
||||
let pattern = match span_token {
|
||||
"day" => "%m-%d",
|
||||
"week" => "%W",
|
||||
"month" => "%m",
|
||||
_ => return Err(anyhow::anyhow!("invalid span token: {}", span_token)),
|
||||
};
|
||||
|
||||
// SQLite's date modifiers want a string like `'-480 minutes'`
|
||||
// (signed) or `'-15 years'`. Use the `+` flag so positive
|
||||
// offsets render as `+480 minutes`.
|
||||
let tz_modifier = format!("{:+} minutes", tz_offset_minutes);
|
||||
let years_modifier = format!("-{} years", years_back);
|
||||
|
||||
let sql = format!(
|
||||
"SELECT rel_path, date_taken, last_modified \
|
||||
FROM image_exif \
|
||||
WHERE library_id = ?1 \
|
||||
AND date_taken IS NOT NULL \
|
||||
AND date_taken <= unixepoch('now') \
|
||||
AND date_taken >= unixepoch('now', ?2) \
|
||||
AND strftime('{p}', date_taken, 'unixepoch', ?3) \
|
||||
= strftime('{p}', 'now', ?3)",
|
||||
p = pattern,
|
||||
);
|
||||
|
||||
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
|
||||
|
||||
diesel::sql_query(sql)
|
||||
.bind::<diesel::sql_types::Integer, _>(library_id)
|
||||
.bind::<diesel::sql_types::Text, _>(years_modifier)
|
||||
.bind::<diesel::sql_types::Text, _>(tz_modifier)
|
||||
.load::<MemoriesWindowRow>(connection.deref_mut())
|
||||
.map(|rows| {
|
||||
rows.into_iter()
|
||||
.map(|r| (r.rel_path, r.date_taken, r.last_modified))
|
||||
.collect()
|
||||
})
|
||||
.map_err(|e| anyhow::anyhow!("Query error: {}", e))
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
|
||||
fn find_by_content_hash(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
@@ -1819,6 +2003,7 @@ mod exif_dao_tests {
|
||||
size_bytes: None,
|
||||
phash_64: None,
|
||||
dhash_64: None,
|
||||
date_taken_source: None,
|
||||
},
|
||||
)
|
||||
.expect("insert exif row");
|
||||
@@ -1931,4 +2116,205 @@ mod exif_dao_tests {
|
||||
// Unknown library: zero, no error.
|
||||
assert_eq!(dao.count_for_library(&ctx(), 999).unwrap(), 0);
|
||||
}
|
||||
|
||||
/// Insert a row with an explicit date source — used by the
|
||||
/// canonical-date drain tests below.
|
||||
fn insert_row_with_source(
|
||||
dao: &mut SqliteExifDao,
|
||||
lib_id: i32,
|
||||
rel: &str,
|
||||
date: Option<i64>,
|
||||
source: Option<&str>,
|
||||
) {
|
||||
dao.store_exif(
|
||||
&ctx(),
|
||||
InsertImageExif {
|
||||
library_id: lib_id,
|
||||
file_path: rel.to_string(),
|
||||
camera_make: None,
|
||||
camera_model: None,
|
||||
lens_model: None,
|
||||
width: None,
|
||||
height: None,
|
||||
orientation: None,
|
||||
gps_latitude: None,
|
||||
gps_longitude: None,
|
||||
gps_altitude: None,
|
||||
focal_length: None,
|
||||
aperture: None,
|
||||
shutter_speed: None,
|
||||
iso: None,
|
||||
date_taken: date,
|
||||
created_time: 0,
|
||||
last_modified: 0,
|
||||
content_hash: None,
|
||||
size_bytes: None,
|
||||
phash_64: None,
|
||||
dhash_64: None,
|
||||
date_taken_source: source.map(|s| s.to_string()),
|
||||
},
|
||||
)
|
||||
.expect("insert exif row");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn get_rows_needing_date_backfill_returns_null_and_fs_time() {
|
||||
let mut dao = setup_two_libraries();
|
||||
// Each row exercises a different source: null, fs_time (eligible),
|
||||
// filename and exif (skipped).
|
||||
insert_row_with_source(&mut dao, 1, "main/null.jpg", None, None);
|
||||
insert_row_with_source(&mut dao, 1, "main/fs.jpg", Some(123), Some("fs_time"));
|
||||
insert_row_with_source(&mut dao, 1, "main/name.jpg", Some(456), Some("filename"));
|
||||
insert_row_with_source(&mut dao, 1, "main/real.jpg", Some(789), Some("exif"));
|
||||
// Other library — never returned even when eligible.
|
||||
insert_row_with_source(&mut dao, 2, "archive/null.jpg", None, None);
|
||||
|
||||
let rows = dao.get_rows_needing_date_backfill(&ctx(), 1, 100).unwrap();
|
||||
let paths: Vec<String> = rows.into_iter().map(|(_, p)| p).collect();
|
||||
assert_eq!(paths.len(), 2, "expected null + fs_time eligible only");
|
||||
assert!(paths.contains(&"main/null.jpg".to_string()));
|
||||
assert!(paths.contains(&"main/fs.jpg".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn backfill_date_taken_writes_date_and_source_only() {
|
||||
let mut dao = setup_two_libraries();
|
||||
insert_row_with_source(&mut dao, 1, "main/x.jpg", None, None);
|
||||
// Set a content_hash on the row to verify backfill_date_taken
|
||||
// doesn't disturb other columns. Using the existing
|
||||
// backfill_content_hash for this verifies via a separate path.
|
||||
dao.backfill_content_hash(&ctx(), 1, "main/x.jpg", "deadbeef", 1024)
|
||||
.unwrap();
|
||||
|
||||
dao.backfill_date_taken(&ctx(), 1, "main/x.jpg", 1700000000, "exiftool")
|
||||
.unwrap();
|
||||
|
||||
let row = dao.get_exif(&ctx(), "main/x.jpg").unwrap().unwrap();
|
||||
assert_eq!(row.date_taken, Some(1700000000));
|
||||
assert_eq!(row.date_taken_source, Some("exiftool".to_string()));
|
||||
// Untouched columns survive.
|
||||
assert_eq!(row.content_hash, Some("deadbeef".to_string()));
|
||||
assert_eq!(row.size_bytes, Some(1024));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn get_memories_in_window_day_matches_only_same_md_in_year_window() {
|
||||
let mut dao = setup_two_libraries();
|
||||
|
||||
// Anchor on a known date so the test is timezone-stable: insert
|
||||
// rows whose date_taken IS the same wall-clock time as `now()`
|
||||
// would have been some N years ago, and verify the day-span
|
||||
// filter returns them. We can't bind 'now' from Rust, so instead
|
||||
// we insert rows for the *current* day (offset by 365 days * N
|
||||
// years) and rely on SQLite computing the same `%m-%d` for both
|
||||
// sides of the equality. Using the unix-now-minus-365*N seconds
|
||||
// approximation is good enough — leap years drift by ~one day
|
||||
// every four years, but the test only checks day-of-year match
|
||||
// for rows inserted "today minus N years (no leap correction)".
|
||||
// To dodge the leap-year drift entirely, we use rows whose
|
||||
// calendar date is read back from SQLite and we just check
|
||||
// membership.
|
||||
|
||||
// 1y, 5y, 10y, 21y back from 'now':
|
||||
let now_ts = chrono::Utc::now().timestamp();
|
||||
let year_secs: i64 = 365 * 86_400;
|
||||
insert_row_with_source(
|
||||
&mut dao,
|
||||
1,
|
||||
"y1.jpg",
|
||||
Some(now_ts - year_secs),
|
||||
Some("exif"),
|
||||
);
|
||||
insert_row_with_source(
|
||||
&mut dao,
|
||||
1,
|
||||
"y5.jpg",
|
||||
Some(now_ts - 5 * year_secs),
|
||||
Some("exif"),
|
||||
);
|
||||
insert_row_with_source(
|
||||
&mut dao,
|
||||
1,
|
||||
"y10.jpg",
|
||||
Some(now_ts - 10 * year_secs),
|
||||
Some("exif"),
|
||||
);
|
||||
// Outside the 20-year window:
|
||||
insert_row_with_source(
|
||||
&mut dao,
|
||||
1,
|
||||
"y21.jpg",
|
||||
Some(now_ts - 21 * year_secs),
|
||||
Some("exif"),
|
||||
);
|
||||
// Future row: must be excluded by the `<= now` clause.
|
||||
insert_row_with_source(
|
||||
&mut dao,
|
||||
1,
|
||||
"future.jpg",
|
||||
Some(now_ts + 86_400),
|
||||
Some("exif"),
|
||||
);
|
||||
// No date — never returned regardless of source.
|
||||
insert_row_with_source(&mut dao, 1, "nodate.jpg", None, None);
|
||||
|
||||
// Month span returns rows from the same calendar month over the
|
||||
// window — y1, y5, y10 should all qualify (same month any year),
|
||||
// y21 trims (out of years_back), future trims (> now), nodate
|
||||
// never qualifies. Day-of-month leap drift means even with 365-
|
||||
// day approximation a row may shift by one in either direction;
|
||||
// month is the safer assertion under that approximation.
|
||||
let rows = dao
|
||||
.get_memories_in_window(&ctx(), 1, "month", 20, 0)
|
||||
.unwrap();
|
||||
let paths: std::collections::HashSet<String> =
|
||||
rows.into_iter().map(|(p, _, _)| p).collect();
|
||||
assert!(
|
||||
paths.contains("y1.jpg") && paths.contains("y5.jpg") && paths.contains("y10.jpg"),
|
||||
"month span should include all in-window rows: {:?}",
|
||||
paths
|
||||
);
|
||||
assert!(
|
||||
!paths.contains("y21.jpg"),
|
||||
"21-year-old row should fall outside the years_back window"
|
||||
);
|
||||
assert!(!paths.contains("future.jpg"), "future row must be excluded");
|
||||
assert!(
|
||||
!paths.contains("nodate.jpg"),
|
||||
"row without date must never appear"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn get_memories_in_window_scopes_by_library_id() {
|
||||
let mut dao = setup_two_libraries();
|
||||
let now_ts = chrono::Utc::now().timestamp();
|
||||
let year = 365 * 86_400i64;
|
||||
insert_row_with_source(&mut dao, 1, "main/x.jpg", Some(now_ts - year), Some("exif"));
|
||||
insert_row_with_source(
|
||||
&mut dao,
|
||||
2,
|
||||
"archive/x.jpg",
|
||||
Some(now_ts - year),
|
||||
Some("exif"),
|
||||
);
|
||||
|
||||
let lib1 = dao
|
||||
.get_memories_in_window(&ctx(), 1, "month", 20, 0)
|
||||
.unwrap();
|
||||
let lib2 = dao
|
||||
.get_memories_in_window(&ctx(), 2, "month", 20, 0)
|
||||
.unwrap();
|
||||
assert_eq!(lib1.len(), 1);
|
||||
assert_eq!(lib1[0].0, "main/x.jpg");
|
||||
assert_eq!(lib2.len(), 1);
|
||||
assert_eq!(lib2[0].0, "archive/x.jpg");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn get_memories_in_window_rejects_unknown_span_token() {
|
||||
let mut dao = setup_two_libraries();
|
||||
let err = dao.get_memories_in_window(&ctx(), 1, "decade", 20, 0);
|
||||
assert!(err.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,6 +63,12 @@ pub struct InsertImageExif {
|
||||
pub phash_64: Option<i64>,
|
||||
/// 64-bit dHash (gradient). NULL for videos and decode failures.
|
||||
pub dhash_64: Option<i64>,
|
||||
/// Which step of the canonical-date waterfall populated `date_taken`:
|
||||
/// `"exif"` | `"exiftool"` | `"filename"` | `"fs_time"`. NULL when
|
||||
/// `date_taken` is NULL (no source resolved it). The per-tick backfill
|
||||
/// drain re-resolves rows whose source is `"fs_time"` once exiftool
|
||||
/// has had a chance to run.
|
||||
pub date_taken_source: Option<String>,
|
||||
}
|
||||
|
||||
// Field order matches the post-migration column order in `image_exif`.
|
||||
@@ -98,6 +104,8 @@ pub struct ImageExif {
|
||||
pub duplicate_of_hash: Option<String>,
|
||||
/// Unix seconds at which the resolve was committed.
|
||||
pub duplicate_decided_at: Option<i64>,
|
||||
/// Which step of the canonical-date waterfall populated `date_taken`.
|
||||
pub date_taken_source: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Insertable)]
|
||||
|
||||
@@ -125,6 +125,7 @@ diesel::table! {
|
||||
dhash_64 -> Nullable<BigInt>,
|
||||
duplicate_of_hash -> Nullable<Text>,
|
||||
duplicate_decided_at -> Nullable<BigInt>,
|
||||
date_taken_source -> Nullable<Text>,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
495
src/date_resolver.rs
Normal file
495
src/date_resolver.rs
Normal file
@@ -0,0 +1,495 @@
|
||||
//! Canonical `date_taken` resolution for ingest and the per-tick backfill
|
||||
//! drain.
|
||||
//!
|
||||
//! The waterfall (in order; first hit wins):
|
||||
//!
|
||||
//! 1. **kamadak-exif** — fast in-process EXIF read. Already done by
|
||||
//! `exif::extract_exif_from_path` for image-bearing formats; callers
|
||||
//! pass that result in via `prior_exif_date` so we don't re-parse.
|
||||
//! 2. **exiftool** — shell-out fallback that reaches places kamadak-exif
|
||||
//! can't: QuickTime/MP4 (`MediaCreateDate`, `TrackCreateDate`,
|
||||
//! `CreateDate`), Apple's `ContentCreateDate`, MakerNote sub-IFDs.
|
||||
//! Required for videos to land a real date; degrades silently when
|
||||
//! `exiftool` isn't on PATH.
|
||||
//! 3. **filename regex** — `memories::extract_date_from_filename` covers
|
||||
//! common screenshot / chat-export / timestamp-named patterns.
|
||||
//! 4. **earliest filesystem time** — `utils::earliest_fs_time` picks the
|
||||
//! earlier of created / modified, which on copied-from-backup files is
|
||||
//! a better proxy for content age than either alone.
|
||||
//!
|
||||
//! `DateSource` records which step won so the per-tick drain can re-resolve
|
||||
//! weak sources (`fs_time`) once exiftool becomes available, and so the
|
||||
//! UI/debug surface can answer "why does this photo show up under this
|
||||
//! date." Note that the previous `/memories` request-time logic preferred
|
||||
//! filename even when EXIF was present; this resolver inverts that — EXIF
|
||||
//! is authoritative when it exists, on the theory that an EXIF
|
||||
//! `DateTimeOriginal` is more reliable than a filename pattern that may
|
||||
//! reflect import time rather than capture time.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use log::{debug, trace, warn};
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::utils::earliest_fs_time;
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
pub enum DateSource {
|
||||
/// kamadak-exif read DateTime/DateTimeOriginal directly.
|
||||
Exif,
|
||||
/// exiftool fallback caught a video / MakerNote / QuickTime tag.
|
||||
Exiftool,
|
||||
/// `extract_date_from_filename` matched a known pattern.
|
||||
Filename,
|
||||
/// Fell through to `earliest_fs_time(metadata)`.
|
||||
FsTime,
|
||||
}
|
||||
|
||||
impl DateSource {
|
||||
pub fn as_str(self) -> &'static str {
|
||||
match self {
|
||||
DateSource::Exif => "exif",
|
||||
DateSource::Exiftool => "exiftool",
|
||||
DateSource::Filename => "filename",
|
||||
DateSource::FsTime => "fs_time",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct ResolvedDate {
|
||||
pub timestamp: i64,
|
||||
pub source: DateSource,
|
||||
}
|
||||
|
||||
/// Resolve the canonical date for a single file, given an already-extracted
|
||||
/// kamadak-exif date if available. Returns `None` only if every step in the
|
||||
/// waterfall fails — for files that exist on disk this should be vanishingly
|
||||
/// rare (the fs-time fallback alone almost always succeeds).
|
||||
pub fn resolve_date_taken(path: &Path, prior_exif_date: Option<i64>) -> Option<ResolvedDate> {
|
||||
if let Some(ts) = prior_exif_date {
|
||||
return Some(ResolvedDate {
|
||||
timestamp: ts,
|
||||
source: DateSource::Exif,
|
||||
});
|
||||
}
|
||||
if let Some(ts) = exiftool_date_single(path) {
|
||||
return Some(ResolvedDate {
|
||||
timestamp: ts,
|
||||
source: DateSource::Exiftool,
|
||||
});
|
||||
}
|
||||
if let Some(dt) = path
|
||||
.file_name()
|
||||
.and_then(|f| f.to_str())
|
||||
.and_then(crate::memories::extract_date_from_filename)
|
||||
{
|
||||
return Some(ResolvedDate {
|
||||
timestamp: dt.timestamp(),
|
||||
source: DateSource::Filename,
|
||||
});
|
||||
}
|
||||
if let Ok(meta) = std::fs::metadata(path)
|
||||
&& let Some(t) = earliest_fs_time(&meta)
|
||||
{
|
||||
let dt: DateTime<Utc> = t.into();
|
||||
return Some(ResolvedDate {
|
||||
timestamp: dt.timestamp(),
|
||||
source: DateSource::FsTime,
|
||||
});
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Batch waterfall. exiftool runs once over the whole batch (single
|
||||
/// subprocess); everything else is per-file and runs only on misses.
|
||||
/// `prior_exif_dates` lets the caller pass in already-known kamadak dates
|
||||
/// keyed by path; entries without a prior date fall through to exiftool
|
||||
/// and the rest of the waterfall.
|
||||
///
|
||||
/// The per-tick backfill drain is the primary caller — it loads ~500 rows
|
||||
/// at a time and uses one exiftool subprocess to drain the lot.
|
||||
pub fn resolve_dates_batch(
|
||||
paths: &[PathBuf],
|
||||
prior_exif_dates: &HashMap<PathBuf, i64>,
|
||||
) -> HashMap<PathBuf, ResolvedDate> {
|
||||
let mut out: HashMap<PathBuf, ResolvedDate> = HashMap::new();
|
||||
let mut needs_exiftool: Vec<&Path> = Vec::with_capacity(paths.len());
|
||||
|
||||
for path in paths {
|
||||
if let Some(&ts) = prior_exif_dates.get(path) {
|
||||
out.insert(
|
||||
path.clone(),
|
||||
ResolvedDate {
|
||||
timestamp: ts,
|
||||
source: DateSource::Exif,
|
||||
},
|
||||
);
|
||||
} else {
|
||||
needs_exiftool.push(path.as_path());
|
||||
}
|
||||
}
|
||||
|
||||
if !needs_exiftool.is_empty() {
|
||||
let exiftool_results = exiftool_dates_batch(&needs_exiftool);
|
||||
for path in &needs_exiftool {
|
||||
if let Some(&ts) = exiftool_results.get(*path) {
|
||||
out.insert(
|
||||
path.to_path_buf(),
|
||||
ResolvedDate {
|
||||
timestamp: ts,
|
||||
source: DateSource::Exiftool,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for path in paths {
|
||||
if out.contains_key(path) {
|
||||
continue;
|
||||
}
|
||||
if let Some(dt) = path
|
||||
.file_name()
|
||||
.and_then(|f| f.to_str())
|
||||
.and_then(crate::memories::extract_date_from_filename)
|
||||
{
|
||||
out.insert(
|
||||
path.clone(),
|
||||
ResolvedDate {
|
||||
timestamp: dt.timestamp(),
|
||||
source: DateSource::Filename,
|
||||
},
|
||||
);
|
||||
continue;
|
||||
}
|
||||
if let Ok(meta) = std::fs::metadata(path)
|
||||
&& let Some(t) = earliest_fs_time(&meta)
|
||||
{
|
||||
let dt: DateTime<Utc> = t.into();
|
||||
out.insert(
|
||||
path.clone(),
|
||||
ResolvedDate {
|
||||
timestamp: dt.timestamp(),
|
||||
source: DateSource::FsTime,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
/// Tag priority for exiftool extraction. First non-zero value wins.
///
/// Photos: `DateTimeOriginal` (original capture) and `SubSecDateTimeOriginal`
/// are most authoritative. `CreateDate` is a common alias and a sane fallback.
///
/// Videos: `MediaCreateDate` / `TrackCreateDate` are the QuickTime/MP4
/// timestamps. `ContentCreateDate` is Apple's iOS-set tag; it often
/// reflects local capture time on iPhone exports better than the others.
///
/// Notably absent: `FileModifyDate` / `FileAccessDate` — those are
/// filesystem-derived and the resolver covers them via the `fs_time`
/// fallback. Letting exiftool pull them here would mask "no real EXIF
/// date" with a `source = exiftool` row that's no better than fs_time.
///
/// NOTE: keep this order in sync with the candidate walk in
/// `parse_exiftool_json` — that function checks the deserialized fields
/// in this same priority order.
const EXIFTOOL_DATE_TAGS: &[&str] = &[
    "DateTimeOriginal",
    "SubSecDateTimeOriginal",
    "CreateDate",
    "MediaCreateDate",
    "TrackCreateDate",
    "ContentCreateDate",
];
|
||||
|
||||
/// Cache the "exiftool exists on PATH" check across the process lifetime so
|
||||
/// the per-tick backfill doesn't fork a doomed subprocess every iteration on
|
||||
/// deploys without exiftool installed.
|
||||
fn exiftool_available() -> bool {
|
||||
static AVAIL: OnceLock<bool> = OnceLock::new();
|
||||
*AVAIL.get_or_init(|| {
|
||||
let ok = Command::new("exiftool")
|
||||
.arg("-ver")
|
||||
.stdout(Stdio::null())
|
||||
.stderr(Stdio::null())
|
||||
.status()
|
||||
.map(|s| s.success())
|
||||
.unwrap_or(false);
|
||||
if !ok {
|
||||
warn!("exiftool not on PATH; date_taken waterfall skips that step");
|
||||
}
|
||||
ok
|
||||
})
|
||||
}
|
||||
|
||||
/// One-file exiftool invocation. Used by the upload + GPS-write paths,
|
||||
/// which deal with one file at a time. The batch path uses
|
||||
/// `exiftool_dates_batch` so we don't pay subprocess startup per row.
|
||||
fn exiftool_date_single(path: &Path) -> Option<i64> {
|
||||
if !exiftool_available() {
|
||||
return None;
|
||||
}
|
||||
let mut cmd = Command::new("exiftool");
|
||||
cmd.arg("-j").arg("-q").arg("-d").arg("%s").arg("-fast2");
|
||||
for tag in EXIFTOOL_DATE_TAGS {
|
||||
cmd.arg(format!("-{}", tag));
|
||||
}
|
||||
cmd.arg(path);
|
||||
let output = cmd.output().ok()?;
|
||||
if !output.status.success() {
|
||||
trace!("exiftool exited non-zero for {:?}", path);
|
||||
return None;
|
||||
}
|
||||
parse_exiftool_json(&output.stdout)
|
||||
.into_iter()
|
||||
.next()
|
||||
.map(|(_, ts)| ts)
|
||||
}
|
||||
|
||||
/// Drain a batch via a single exiftool subprocess. Paths are fed on stdin
|
||||
/// via `-@ -`, so the argv stays short regardless of batch size — safe for
|
||||
/// libraries with very long path components.
|
||||
fn exiftool_dates_batch(paths: &[&Path]) -> HashMap<PathBuf, i64> {
|
||||
let mut out = HashMap::new();
|
||||
if paths.is_empty() || !exiftool_available() {
|
||||
return out;
|
||||
}
|
||||
|
||||
let mut cmd = Command::new("exiftool");
|
||||
cmd.arg("-j").arg("-q").arg("-d").arg("%s").arg("-fast2");
|
||||
for tag in EXIFTOOL_DATE_TAGS {
|
||||
cmd.arg(format!("-{}", tag));
|
||||
}
|
||||
cmd.arg("-@").arg("-");
|
||||
cmd.stdin(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::null());
|
||||
|
||||
let mut child = match cmd.spawn() {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
warn!("exiftool batch spawn failed: {}", e);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(mut stdin) = child.stdin.take() {
|
||||
for p in paths {
|
||||
// exiftool's argfile reader treats each line as one path; OS
|
||||
// path bytes don't always survive a String round-trip, but
|
||||
// every path we get here originated from rel_path / root_path
|
||||
// strings already, so to-string-lossy is a non-event.
|
||||
if let Err(e) = writeln!(stdin, "{}", p.display()) {
|
||||
warn!("exiftool batch stdin write failed: {}", e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let output = match child.wait_with_output() {
|
||||
Ok(o) => o,
|
||||
Err(e) => {
|
||||
warn!("exiftool batch wait failed: {}", e);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
if !output.status.success() {
|
||||
debug!(
|
||||
"exiftool batch exit status {:?}; partial output may still parse",
|
||||
output.status.code()
|
||||
);
|
||||
}
|
||||
for (source, ts) in parse_exiftool_json(&output.stdout) {
|
||||
out.insert(PathBuf::from(source), ts);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// One row per input file. exiftool emits any tag we asked for that was
/// present, plus the `SourceFile` it was reading. Tags are JSON values
/// because `-d %s` returns the timestamp as a *string* of digits, not a
/// number, when the date parses; absent tags are simply missing keys.
///
/// Field order mirrors `EXIFTOOL_DATE_TAGS`; the actual priority walk
/// over these fields happens in `parse_exiftool_json`.
#[derive(Debug, Deserialize)]
struct ExiftoolEntry {
    /// Path exactly as exiftool echoed it back — keys the result map.
    #[serde(rename = "SourceFile")]
    source_file: String,
    #[serde(rename = "DateTimeOriginal")]
    date_time_original: Option<serde_json::Value>,
    #[serde(rename = "SubSecDateTimeOriginal")]
    sub_sec_date_time_original: Option<serde_json::Value>,
    #[serde(rename = "CreateDate")]
    create_date: Option<serde_json::Value>,
    #[serde(rename = "MediaCreateDate")]
    media_create_date: Option<serde_json::Value>,
    #[serde(rename = "TrackCreateDate")]
    track_create_date: Option<serde_json::Value>,
    #[serde(rename = "ContentCreateDate")]
    content_create_date: Option<serde_json::Value>,
}
|
||||
|
||||
fn parse_exiftool_json(stdout: &[u8]) -> Vec<(String, i64)> {
|
||||
let entries: Vec<ExiftoolEntry> = match serde_json::from_slice(stdout) {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
// Empty stdout on total failure isn't a parse error worth
|
||||
// logging at warn — the caller already noted the non-zero
|
||||
// exit status.
|
||||
if !stdout.is_empty() {
|
||||
warn!("exiftool JSON parse failed: {}", e);
|
||||
}
|
||||
return Vec::new();
|
||||
}
|
||||
};
|
||||
|
||||
let mut out = Vec::with_capacity(entries.len());
|
||||
for entry in entries {
|
||||
// Walk the priority list. exiftool sometimes returns the literal
|
||||
// string "0000:00:00 00:00:00" for missing-but-allocated date
|
||||
// slots; with `-d %s` that becomes the unix epoch (0). Reject
|
||||
// anything <= 0 so we fall through to the next tag.
|
||||
let tags = [
|
||||
entry.date_time_original.as_ref(),
|
||||
entry.sub_sec_date_time_original.as_ref(),
|
||||
entry.create_date.as_ref(),
|
||||
entry.media_create_date.as_ref(),
|
||||
entry.track_create_date.as_ref(),
|
||||
entry.content_create_date.as_ref(),
|
||||
];
|
||||
let mut chosen: Option<i64> = None;
|
||||
for tag in tags.iter().flatten() {
|
||||
if let Some(ts) = coerce_to_unix_seconds(tag)
|
||||
&& ts > 0
|
||||
{
|
||||
chosen = Some(ts);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if let Some(ts) = chosen {
|
||||
out.push((entry.source_file, ts));
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// `-d %s` should hand us a numeric string, but exiftool's JSON encoder
|
||||
/// will emit a number when the tag was defined as numeric in its lib —
|
||||
/// accept both shapes.
|
||||
fn coerce_to_unix_seconds(v: &serde_json::Value) -> Option<i64> {
|
||||
match v {
|
||||
serde_json::Value::String(s) => s.trim().parse::<i64>().ok(),
|
||||
serde_json::Value::Number(n) => n.as_i64(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_exiftool_json_picks_first_priority_tag() {
        // DateTimeOriginal outranks CreateDate in EXIFTOOL_DATE_TAGS.
        let json = br#"[{
            "SourceFile": "/lib/IMG.jpg",
            "DateTimeOriginal": "1500000000",
            "CreateDate": "1400000000"
        }]"#;
        let parsed = parse_exiftool_json(json);
        assert_eq!(parsed, vec![("/lib/IMG.jpg".to_string(), 1500000000)]);
    }

    #[test]
    fn parse_exiftool_json_falls_through_zeros() {
        // exiftool emits "0000:00:00 00:00:00" → unix epoch 0 with -d %s.
        // The resolver should skip those and pick the next tag.
        let json = br#"[{
            "SourceFile": "/lib/clip.mov",
            "DateTimeOriginal": "0",
            "MediaCreateDate": "1500000000"
        }]"#;
        let parsed = parse_exiftool_json(json);
        assert_eq!(parsed, vec![("/lib/clip.mov".to_string(), 1500000000)]);
    }

    #[test]
    fn parse_exiftool_json_accepts_numeric_values() {
        // exiftool's JSON encoder may emit a bare number instead of a
        // digit string — coerce_to_unix_seconds accepts both shapes.
        let json = br#"[{
            "SourceFile": "/lib/a.jpg",
            "CreateDate": 1234567890
        }]"#;
        let parsed = parse_exiftool_json(json);
        assert_eq!(parsed, vec![("/lib/a.jpg".to_string(), 1234567890)]);
    }

    #[test]
    fn parse_exiftool_json_emits_nothing_when_no_tag_present() {
        // A file with only SourceFile (no date tags) produces no pair at
        // all, letting the caller fall through to weaker date sources.
        let json = br#"[{"SourceFile": "/lib/no_dates.bin"}]"#;
        let parsed = parse_exiftool_json(json);
        assert!(parsed.is_empty());
    }

    #[test]
    fn parse_exiftool_json_handles_multiple_entries() {
        // Batch mode: one output pair per input file, in input order.
        let json = br#"[
            {"SourceFile": "/lib/a.jpg", "DateTimeOriginal": "100"},
            {"SourceFile": "/lib/b.jpg", "CreateDate": "200"}
        ]"#;
        let parsed = parse_exiftool_json(json);
        assert_eq!(
            parsed,
            vec![
                ("/lib/a.jpg".to_string(), 100),
                ("/lib/b.jpg".to_string(), 200)
            ]
        );
    }

    #[test]
    fn date_source_as_str_round_trip() {
        // Every source variant must serialize to a non-empty tag —
        // these strings land in image_exif.date_taken_source.
        for src in [
            DateSource::Exif,
            DateSource::Exiftool,
            DateSource::Filename,
            DateSource::FsTime,
        ] {
            assert!(!src.as_str().is_empty());
        }
    }

    #[test]
    fn resolve_uses_prior_exif_when_present() {
        // Path doesn't need to exist when prior_exif_date short-circuits.
        let resolved =
            resolve_date_taken(Path::new("/nonexistent/file.jpg"), Some(1700000000)).unwrap();
        assert_eq!(resolved.timestamp, 1700000000);
        assert_eq!(resolved.source, DateSource::Exif);
    }

    #[test]
    fn resolve_filename_when_no_exif_and_file_missing() {
        // No prior EXIF, no exiftool match (file missing), but the filename
        // pattern still matches so the resolver lands on Filename.
        let resolved = resolve_date_taken(
            Path::new("/nonexistent/Screenshot_2014-06-01-20-44-50.png"),
            None,
        )
        .unwrap();
        assert_eq!(resolved.source, DateSource::Filename);
    }

    #[test]
    fn resolve_fs_time_when_only_metadata_available() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("plain.jpg");
        std::fs::File::create(&path).unwrap();
        let resolved = resolve_date_taken(&path, None).unwrap();
        // exiftool may or may not be installed in the test env; either
        // way the file has no EXIF and no filename date, so we should
        // fall to fs_time.
        assert_eq!(resolved.source, DateSource::FsTime);
    }
}
|
||||
33
src/files.rs
33
src/files.rs
@@ -1508,6 +1508,7 @@ mod tests {
|
||||
dhash_64: data.dhash_64,
|
||||
duplicate_of_hash: None,
|
||||
duplicate_decided_at: None,
|
||||
date_taken_source: data.date_taken_source.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1551,6 +1552,7 @@ mod tests {
|
||||
dhash_64: data.dhash_64,
|
||||
duplicate_of_hash: None,
|
||||
duplicate_decided_at: None,
|
||||
date_taken_source: data.date_taken_source.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1644,6 +1646,37 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Test-double stubs for the date-backfill / memories DAO surface —
// presumably part of a mock ExifDao impl in this test module (TODO
// confirm against the enclosing impl, which is outside this view).
// Each one returns an empty or no-op success so unrelated tests don't
// need a real database.

/// Mock: no rows ever need a date backfill.
fn get_rows_needing_date_backfill(
    &mut self,
    _context: &opentelemetry::Context,
    _library_id: i32,
    _limit: i64,
) -> Result<Vec<(i32, String)>, DbError> {
    Ok(Vec::new())
}

/// Mock: accept any backfill write and discard it.
fn backfill_date_taken(
    &mut self,
    _context: &opentelemetry::Context,
    _library_id: i32,
    _rel_path: &str,
    _date_taken: i64,
    _source: &str,
) -> Result<(), DbError> {
    Ok(())
}

/// Mock: no memories in any window.
fn get_memories_in_window(
    &mut self,
    _context: &opentelemetry::Context,
    _library_id: i32,
    _span_token: &str,
    _years_back: i32,
    _tz_offset_minutes: i32,
) -> Result<Vec<(String, i64, i64)>, DbError> {
    Ok(Vec::new())
}
|
||||
|
||||
fn find_by_content_hash(
|
||||
&mut self,
|
||||
_context: &opentelemetry::Context,
|
||||
|
||||
@@ -10,6 +10,7 @@ pub mod cleanup;
|
||||
pub mod content_hash;
|
||||
pub mod data;
|
||||
pub mod database;
|
||||
pub mod date_resolver;
|
||||
pub mod duplicates;
|
||||
pub mod error;
|
||||
pub mod exif;
|
||||
|
||||
136
src/main.rs
136
src/main.rs
@@ -64,6 +64,7 @@ mod auth;
|
||||
mod content_hash;
|
||||
mod data;
|
||||
mod database;
|
||||
mod date_resolver;
|
||||
mod duplicates;
|
||||
mod error;
|
||||
mod exif;
|
||||
@@ -503,6 +504,11 @@ async fn set_image_gps(
|
||||
};
|
||||
let now = Utc::now().timestamp();
|
||||
let normalized_path = body.path.replace('\\', "/");
|
||||
// Re-run the canonical-date waterfall on every GPS write — exiftool
|
||||
// writing GPS doesn't change the capture date, but if the row was
|
||||
// previously sourced from `fs_time` the re-read may have given us a
|
||||
// real EXIF date this time, and we want to upgrade the source.
|
||||
let resolved_date = date_resolver::resolve_date_taken(&full_path, extracted.date_taken);
|
||||
let insert_exif = InsertImageExif {
|
||||
library_id: resolved_library.id,
|
||||
file_path: normalized_path.clone(),
|
||||
@@ -519,7 +525,7 @@ async fn set_image_gps(
|
||||
aperture: extracted.aperture.map(|v| v as f32),
|
||||
shutter_speed: extracted.shutter_speed,
|
||||
iso: extracted.iso,
|
||||
date_taken: extracted.date_taken,
|
||||
date_taken: resolved_date.map(|r| r.timestamp),
|
||||
// Created_time is preserved by update_exif (it doesn't touch the
|
||||
// column); pass any int — it's ignored in the UPDATE statement.
|
||||
created_time: now,
|
||||
@@ -537,6 +543,7 @@ async fn set_image_gps(
|
||||
// with a usable signal; failure just leaves prior values in place.
|
||||
phash_64: perceptual_hash::compute(&full_path).map(|h| h.phash_64),
|
||||
dhash_64: perceptual_hash::compute(&full_path).map(|h| h.dhash_64),
|
||||
date_taken_source: resolved_date.map(|r| r.source.as_str().to_string()),
|
||||
};
|
||||
|
||||
let updated = {
|
||||
@@ -749,6 +756,10 @@ async fn upload_image(
|
||||
}
|
||||
};
|
||||
let perceptual = perceptual_hash::compute(&uploaded_path);
|
||||
let resolved_date = date_resolver::resolve_date_taken(
|
||||
&uploaded_path,
|
||||
exif_data.date_taken,
|
||||
);
|
||||
let insert_exif = InsertImageExif {
|
||||
library_id: target_library.id,
|
||||
file_path: relative_path.clone(),
|
||||
@@ -765,13 +776,15 @@ async fn upload_image(
|
||||
aperture: exif_data.aperture.map(|v| v as f32),
|
||||
shutter_speed: exif_data.shutter_speed,
|
||||
iso: exif_data.iso,
|
||||
date_taken: exif_data.date_taken,
|
||||
date_taken: resolved_date.map(|r| r.timestamp),
|
||||
created_time: timestamp,
|
||||
last_modified: timestamp,
|
||||
content_hash,
|
||||
size_bytes,
|
||||
phash_64: perceptual.map(|h| h.phash_64),
|
||||
dhash_64: perceptual.map(|h| h.dhash_64),
|
||||
date_taken_source: resolved_date
|
||||
.map(|r| r.source.as_str().to_string()),
|
||||
};
|
||||
|
||||
if let Ok(mut dao) = exif_dao.lock() {
|
||||
@@ -2112,6 +2125,15 @@ fn watch_files(
|
||||
);
|
||||
}
|
||||
|
||||
// Date-taken backfill: drain rows whose canonical date is
|
||||
// either unresolved or only fs_time-sourced. Independent
|
||||
// of face detection — runs even on deploys that don't
|
||||
// configure Apollo, since `/memories` depends on it.
|
||||
{
|
||||
let context = opentelemetry::Context::new();
|
||||
backfill_missing_date_taken(&context, lib, &exif_dao);
|
||||
}
|
||||
|
||||
if is_full_scan {
|
||||
info!(
|
||||
"Running full scan for library '{}' (scan #{})",
|
||||
@@ -2377,6 +2399,16 @@ fn process_new_files(
|
||||
None
|
||||
};
|
||||
|
||||
// Canonical date_taken via the waterfall — kamadak-exif (already
|
||||
// computed above) → exiftool fallback for videos / MakerNote /
|
||||
// QuickTime → filename regex → earliest_fs_time. Source is
|
||||
// recorded so the per-tick backfill drain can re-run weak
|
||||
// resolutions later.
|
||||
let resolved_date = date_resolver::resolve_date_taken(
|
||||
&file_path,
|
||||
exif_fields.as_ref().and_then(|e| e.date_taken),
|
||||
);
|
||||
|
||||
let insert_exif = InsertImageExif {
|
||||
library_id: library.id,
|
||||
file_path: relative_path.clone(),
|
||||
@@ -2403,13 +2435,14 @@ fn process_new_files(
|
||||
.and_then(|e| e.aperture.map(|v| v as f32)),
|
||||
shutter_speed: exif_fields.as_ref().and_then(|e| e.shutter_speed.clone()),
|
||||
iso: exif_fields.as_ref().and_then(|e| e.iso),
|
||||
date_taken: exif_fields.as_ref().and_then(|e| e.date_taken),
|
||||
date_taken: resolved_date.map(|r| r.timestamp),
|
||||
created_time: timestamp,
|
||||
last_modified: timestamp,
|
||||
content_hash,
|
||||
size_bytes,
|
||||
phash_64: perceptual.map(|h| h.phash_64),
|
||||
dhash_64: perceptual.map(|h| h.dhash_64),
|
||||
date_taken_source: resolved_date.map(|r| r.source.as_str().to_string()),
|
||||
};
|
||||
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
@@ -2682,6 +2715,103 @@ fn backfill_unhashed_backlog(
|
||||
backfilled
|
||||
}
|
||||
|
||||
/// Drain image_exif rows whose `date_taken` was never resolved or was
|
||||
/// resolved by the weakest fallback (`fs_time`). Runs the canonical-date
|
||||
/// waterfall — exiftool batch (one subprocess for the whole tick's
|
||||
/// rows) → filename regex → earliest_fs_time — and persists each
|
||||
/// resolution with its source tag. Capped per tick by
|
||||
/// `DATE_BACKFILL_MAX_PER_TICK` (default 500) so a 14k-row library
|
||||
/// drains over a few quick-scan ticks without blocking the watcher.
|
||||
///
|
||||
/// kamadak-exif is intentionally skipped here: the row already has a
|
||||
/// NULL date_taken because the ingest path's kamadak-exif call returned
|
||||
/// nothing, and re-running it would just produce the same answer.
|
||||
/// exiftool is the meaningful new attempt — it handles videos and
|
||||
/// MakerNote-hosted dates kamadak can't reach.
|
||||
fn backfill_missing_date_taken(
|
||||
context: &opentelemetry::Context,
|
||||
library: &libraries::Library,
|
||||
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
||||
) -> usize {
|
||||
let cap: i64 = dotenv::var("DATE_BACKFILL_MAX_PER_TICK")
|
||||
.ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.filter(|n: &i64| *n > 0)
|
||||
.unwrap_or(500);
|
||||
|
||||
let rows: Vec<(i32, String)> = {
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
dao.get_rows_needing_date_backfill(context, library.id, cap + 1)
|
||||
.unwrap_or_default()
|
||||
};
|
||||
if rows.is_empty() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let more_than_cap = rows.len() as i64 > cap;
|
||||
let base_path = std::path::Path::new(&library.root_path);
|
||||
|
||||
// Build absolute paths and drop rows whose files no longer exist —
|
||||
// the missing-file scan in library_maintenance retires deleted rows
|
||||
// separately. Without this filter, NULL-date rows for missing files
|
||||
// would loop through the drain forever (no source can resolve them).
|
||||
let mut existing: Vec<(String, PathBuf)> = Vec::with_capacity(rows.len() as usize);
|
||||
for (_, rel_path) in rows.iter().take(cap as usize) {
|
||||
let abs = base_path.join(rel_path);
|
||||
if abs.exists() {
|
||||
existing.push((rel_path.clone(), abs));
|
||||
}
|
||||
}
|
||||
if existing.is_empty() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// One exiftool subprocess for the whole batch; the resolver falls
|
||||
// through to filename / fs_time per file when exiftool can't supply
|
||||
// a date (or isn't installed at all).
|
||||
let paths: Vec<PathBuf> = existing.iter().map(|(_, p)| p.clone()).collect();
|
||||
let resolved = date_resolver::resolve_dates_batch(&paths, &HashMap::new());
|
||||
|
||||
let mut backfilled = 0usize;
|
||||
let mut unresolved = 0usize;
|
||||
let mut by_source: HashMap<&'static str, usize> = HashMap::new();
|
||||
{
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
for (rel_path, abs) in &existing {
|
||||
let Some(rd) = resolved.get(abs).copied() else {
|
||||
unresolved += 1;
|
||||
continue;
|
||||
};
|
||||
match dao.backfill_date_taken(
|
||||
context,
|
||||
library.id,
|
||||
rel_path,
|
||||
rd.timestamp,
|
||||
rd.source.as_str(),
|
||||
) {
|
||||
Ok(()) => {
|
||||
backfilled += 1;
|
||||
*by_source.entry(rd.source.as_str()).or_insert(0) += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"date_backfill: update failed for lib {} {}: {:?}",
|
||||
library.id, rel_path, e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if backfilled > 0 || unresolved > 0 || more_than_cap {
|
||||
info!(
|
||||
"date_backfill: library '{}': resolved {} ({:?}), {} unresolved, cap={}, more_remain={}",
|
||||
library.name, backfilled, by_source, unresolved, cap, more_than_cap
|
||||
);
|
||||
}
|
||||
backfilled
|
||||
}
|
||||
|
||||
/// Per-tick face-detection drain. Pulls a capped batch of hashed-but-
|
||||
/// unscanned image_exif rows directly via the FaceDao anti-join and
|
||||
/// hands them to the existing detection pass. Runs on every tick (not
|
||||
|
||||
573
src/memories.rs
573
src/memories.rs
@@ -1,25 +1,19 @@
|
||||
use actix_web::web::Data;
|
||||
use actix_web::{HttpRequest, HttpResponse, Responder, get, web};
|
||||
use chrono::LocalResult::{Ambiguous, Single};
|
||||
use chrono::{DateTime, Datelike, FixedOffset, Local, LocalResult, NaiveDate, TimeZone, Utc};
|
||||
use chrono::{DateTime, FixedOffset, Local, LocalResult, NaiveDate, TimeZone};
|
||||
use log::{debug, trace, warn};
|
||||
use opentelemetry::KeyValue;
|
||||
use opentelemetry::trace::{Span, Status, TraceContextExt, Tracer};
|
||||
use rayon::prelude::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Mutex;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::data::Claims;
|
||||
use crate::database::ExifDao;
|
||||
use crate::files::is_image_or_video;
|
||||
use crate::libraries::Library;
|
||||
use crate::otel::{extract_context_from_request, global_tracer};
|
||||
use crate::state::AppState;
|
||||
use crate::utils::earliest_fs_time;
|
||||
|
||||
// Helper that encapsulates path-exclusion semantics
|
||||
#[derive(Debug)]
|
||||
@@ -139,22 +133,6 @@ pub struct MemoriesResponse {
|
||||
pub items: Vec<MemoryItem>,
|
||||
}
|
||||
|
||||
/// Convert Unix timestamp to NaiveDate in client timezone
|
||||
fn timestamp_to_naive_date(
|
||||
timestamp: i64,
|
||||
client_timezone: &Option<FixedOffset>,
|
||||
) -> Option<NaiveDate> {
|
||||
let dt_utc = DateTime::<Utc>::from_timestamp(timestamp, 0)?;
|
||||
|
||||
let date = if let Some(tz) = client_timezone {
|
||||
dt_utc.with_timezone(tz).date_naive()
|
||||
} else {
|
||||
dt_utc.with_timezone(&Local).date_naive()
|
||||
};
|
||||
|
||||
Some(date)
|
||||
}
|
||||
|
||||
pub fn extract_date_from_filename(filename: &str) -> Option<DateTime<FixedOffset>> {
|
||||
let build_date_from_ymd_capture =
|
||||
|captures: ®ex::Captures| -> Option<DateTime<FixedOffset>> {
|
||||
@@ -283,232 +261,21 @@ pub fn extract_date_from_filename(filename: &str) -> Option<DateTime<FixedOffset
|
||||
None
|
||||
}
|
||||
|
||||
/// Get the canonical date for a memory with priority: filename → EXIF → metadata
|
||||
/// Returns (NaiveDate for matching, timestamp for display, modified timestamp)
|
||||
fn get_memory_date_with_priority(
|
||||
path: &Path,
|
||||
exif_date_taken: Option<i64>,
|
||||
client_timezone: &Option<FixedOffset>,
|
||||
) -> Option<(NaiveDate, Option<i64>, Option<i64>)> {
|
||||
// Read file metadata once
|
||||
let meta = std::fs::metadata(path).ok()?;
|
||||
|
||||
// Priority 1: Try to extract date from filename
|
||||
if let Some(filename_date) = path
|
||||
.file_name()
|
||||
.and_then(|f| f.to_str())
|
||||
.and_then(extract_date_from_filename)
|
||||
{
|
||||
// Convert to client timezone if specified
|
||||
let date_in_timezone = if let Some(tz) = client_timezone {
|
||||
filename_date.with_timezone(tz)
|
||||
} else {
|
||||
filename_date.with_timezone(&Local).fixed_offset()
|
||||
};
|
||||
|
||||
let timestamp = if let Some(tz) = client_timezone {
|
||||
filename_date.with_timezone(tz).timestamp()
|
||||
} else {
|
||||
filename_date.timestamp()
|
||||
};
|
||||
|
||||
let modified = meta.modified().ok().map(|t| {
|
||||
let utc: DateTime<Utc> = t.into();
|
||||
if let Some(tz) = client_timezone {
|
||||
utc.with_timezone(tz).timestamp()
|
||||
} else {
|
||||
utc.timestamp()
|
||||
}
|
||||
});
|
||||
|
||||
debug!(
|
||||
"Memory date from filename {:?} > {:?} = {:?}",
|
||||
path.file_name(),
|
||||
filename_date,
|
||||
date_in_timezone
|
||||
);
|
||||
return Some((date_in_timezone.date_naive(), Some(timestamp), modified));
|
||||
}
|
||||
|
||||
// Priority 2: Use EXIF date_taken if available
|
||||
if let Some(exif_timestamp) = exif_date_taken {
|
||||
let date = timestamp_to_naive_date(exif_timestamp, client_timezone)?;
|
||||
|
||||
let modified = meta.modified().ok().map(|t| {
|
||||
let utc: DateTime<Utc> = t.into();
|
||||
if let Some(tz) = client_timezone {
|
||||
utc.with_timezone(tz).timestamp()
|
||||
} else {
|
||||
utc.timestamp()
|
||||
}
|
||||
});
|
||||
|
||||
debug!("Memory date from EXIF {:?} = {:?}", path.file_name(), date);
|
||||
return Some((date, Some(exif_timestamp), modified));
|
||||
}
|
||||
|
||||
// Priority 3: Fall back to metadata (earlier of created/modified — see utils::earliest_fs_time)
|
||||
let system_time = earliest_fs_time(&meta)?;
|
||||
let dt_utc: DateTime<Utc> = system_time.into();
|
||||
|
||||
let date_in_timezone = if let Some(tz) = client_timezone {
|
||||
dt_utc.with_timezone(tz).date_naive()
|
||||
} else {
|
||||
dt_utc.with_timezone(&Local).date_naive()
|
||||
};
|
||||
|
||||
let created_timestamp = if let Some(tz) = client_timezone {
|
||||
dt_utc.with_timezone(tz).timestamp()
|
||||
} else {
|
||||
dt_utc.timestamp()
|
||||
};
|
||||
|
||||
let modified = meta.modified().ok().map(|t| {
|
||||
let utc: DateTime<Utc> = t.into();
|
||||
if let Some(tz) = client_timezone {
|
||||
utc.with_timezone(tz).timestamp()
|
||||
} else {
|
||||
utc.timestamp()
|
||||
}
|
||||
});
|
||||
|
||||
trace!("Fallback metadata create date = {:?}", date_in_timezone);
|
||||
Some((date_in_timezone, Some(created_timestamp), modified))
|
||||
/// Convert a `date_taken` Unix-seconds value to a `NaiveDate` in the
|
||||
/// client's local time. Falls back to server-local when the client didn't
|
||||
/// send a tz hint.
|
||||
fn date_in_client_tz(timestamp: i64, client_timezone: Option<FixedOffset>) -> Option<NaiveDate> {
|
||||
let dt = DateTime::from_timestamp(timestamp, 0)?;
|
||||
Some(match client_timezone {
|
||||
Some(tz) => dt.with_timezone(&tz).date_naive(),
|
||||
None => dt.with_timezone(&Local).date_naive(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Collect memories from EXIF database.
///
/// Queries the DAO for every row with a `date_taken` in `library_id`,
/// then in parallel (rayon) filters to files that still exist, aren't
/// excluded, and whose resolved date matches the memory window
/// (`now` / `span_mode` / `years_back`). DAO lock or query failure
/// degrades to an empty result rather than an error.
///
/// Returns `(item, canonical NaiveDate)` pairs; the date is the caller's
/// primary sort key.
fn collect_exif_memories(
    exif_dao: &Data<Mutex<Box<dyn ExifDao>>>,
    context: &opentelemetry::Context,
    base_path: &str,
    library_id: i32,
    now: NaiveDate,
    span_mode: MemoriesSpan,
    years_back: u32,
    client_timezone: &Option<FixedOffset>,
    path_excluder: &PathExcluder,
) -> Vec<(MemoryItem, NaiveDate)> {
    // Query database for all files with date_taken
    let exif_records = match exif_dao.lock() {
        Ok(mut dao) => match dao.get_all_with_date_taken(context, Some(library_id)) {
            Ok(records) => records,
            Err(e) => {
                warn!("Failed to query EXIF database: {:?}", e);
                return Vec::new(); // Graceful fallback
            }
        },
        Err(e) => {
            warn!("Failed to lock EXIF DAO: {:?}", e);
            return Vec::new();
        }
    };

    // Parallel processing with Rayon
    exif_records
        .par_iter()
        .filter_map(|(file_path, date_taken_ts)| {
            // Build full path (DB stores paths relative to the library root)
            let full_path = Path::new(base_path).join(file_path);

            // Check exclusions
            if path_excluder.is_excluded(&full_path) {
                return None;
            }

            // Verify file exists — stale DB rows are skipped, not errors
            if !full_path.exists() || !full_path.is_file() {
                warn!("EXIF record exists but file not found: {:?}", full_path);
                return None;
            }

            // Get date with priority: filename → EXIF → metadata
            // This ensures sorting and display use the same date source
            let (file_date, created, modified) =
                get_memory_date_with_priority(&full_path, Some(*date_taken_ts), client_timezone)?;

            // Check if matches memory criteria
            if !is_memories_match(file_path, file_date, now, span_mode, years_back) {
                return None;
            }

            Some((
                MemoryItem {
                    path: file_path.clone(),
                    created,
                    modified,
                    library_id,
                },
                file_date,
            ))
        })
        .collect()
}
|
||||
|
||||
/// Collect memories from file system scan (for files not in EXIF DB).
///
/// Walks `base_path` recursively, skipping paths already handled by the
/// EXIF query (`skip_paths`), excluded paths, and non-image/video files,
/// then resolves each survivor's date (no EXIF available here) and keeps
/// those matching the memory window. The filtered entry list is collected
/// first so the date resolution can run in parallel via rayon.
///
/// Returns `(item, canonical NaiveDate)` pairs with library-relative
/// paths, matching the shape of `collect_exif_memories`.
fn collect_filesystem_memories(
    base_path: &str,
    library_id: i32,
    path_excluder: &PathExcluder,
    skip_paths: &HashSet<PathBuf>,
    now: NaiveDate,
    span_mode: MemoriesSpan,
    years_back: u32,
    client_timezone: &Option<FixedOffset>,
) -> Vec<(MemoryItem, NaiveDate)> {
    let base = Path::new(base_path);

    let entries: Vec<_> = WalkDir::new(base)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| {
            let path = e.path();

            // Skip if already processed by EXIF query
            if skip_paths.contains(path) {
                return false;
            }

            // Check exclusions
            if path_excluder.is_excluded(path) {
                return false;
            }

            // Only process image/video files
            e.file_type().is_file() && is_image_or_video(path)
        })
        .collect();

    entries
        .par_iter()
        .filter_map(|entry| {
            // Use unified date priority function (no EXIF for filesystem scan)
            let (file_date, created, modified) =
                get_memory_date_with_priority(entry.path(), None, client_timezone)?;

            if is_memories_match(
                entry.path().to_str().unwrap_or("Unknown"),
                file_date,
                now,
                span_mode,
                years_back,
            ) {
                // Store the path relative to the library root, like the DB rows
                let path_relative = entry.path().strip_prefix(base).ok()?.to_str()?.to_string();

                Some((
                    MemoryItem {
                        path: path_relative,
                        created,
                        modified,
                        library_id,
                    },
                    file_date,
                ))
            } else {
                None
            }
        })
        .collect()
}
|
||||
/// Default lookback for `/memories`. The original 15-year cap pre-dated
|
||||
/// most of the imported libraries; bumped to 20 so users with deeper
|
||||
/// archives see those photos surface on the matching anniversary too.
|
||||
pub const DEFAULT_YEARS_BACK: i32 = 20;
|
||||
|
||||
#[get("/memories")]
|
||||
pub async fn list_memories(
|
||||
@@ -525,32 +292,28 @@ pub async fn list_memories(
|
||||
opentelemetry::Context::new().with_remote_span_context(span.span_context().clone());
|
||||
|
||||
let span_mode = q.span.unwrap_or(MemoriesSpan::Day);
|
||||
let years_back: u32 = 15;
|
||||
|
||||
// Create timezone from client offset, default to local timezone if not provided
|
||||
let client_timezone = match q.timezone_offset_minutes {
|
||||
Some(offset_mins) => {
|
||||
let offset_secs = offset_mins * 60;
|
||||
Some(
|
||||
FixedOffset::east_opt(offset_secs)
|
||||
.unwrap_or_else(|| FixedOffset::east_opt(0).unwrap()),
|
||||
)
|
||||
}
|
||||
None => None,
|
||||
let span_token = match span_mode {
|
||||
MemoriesSpan::Day => "day",
|
||||
MemoriesSpan::Week => "week",
|
||||
MemoriesSpan::Month => "month",
|
||||
};
|
||||
let years_back: i32 = DEFAULT_YEARS_BACK;
|
||||
|
||||
let now = if let Some(tz) = client_timezone {
|
||||
debug!("Client timezone: {:?}", tz);
|
||||
Utc::now().with_timezone(&tz).date_naive()
|
||||
} else {
|
||||
Local::now().date_naive()
|
||||
};
|
||||
// The SQL filter expects a signed offset in minutes from UTC; default
|
||||
// 0 (UTC) when the client didn't send a hint. We also keep a chrono
|
||||
// `FixedOffset` for sorting/secondary-key date math in Rust below —
|
||||
// anchoring both sides on the same value keeps "what SQL matched" and
|
||||
// "what we sort by" consistent.
|
||||
let tz_offset_minutes = q.timezone_offset_minutes.unwrap_or(0);
|
||||
let client_timezone = q
|
||||
.timezone_offset_minutes
|
||||
.and_then(|offset_mins| FixedOffset::east_opt(offset_mins * 60));
|
||||
|
||||
debug!("Now: {:?}", now);
|
||||
debug!(
|
||||
"list_memories: span={:?} tz_offset_min={} years_back={}",
|
||||
span_mode, tz_offset_minutes, years_back
|
||||
);
|
||||
|
||||
// Resolve the optional library filter. Unknown values are a 400; None
|
||||
// means "all libraries" — currently equivalent to the primary library
|
||||
// while only one is configured.
|
||||
let library = match crate::libraries::resolve_library_param(&app_state, q.library.as_deref()) {
|
||||
Ok(lib) => lib,
|
||||
Err(msg) => {
|
||||
@@ -558,13 +321,13 @@ pub async fn list_memories(
|
||||
return HttpResponse::BadRequest().body(msg);
|
||||
}
|
||||
};
|
||||
// When `library` is `Some`, scope to that one library; otherwise union
|
||||
// across every configured library and let the results interleave.
|
||||
let libraries_to_scan: Vec<&Library> = match library {
|
||||
let libraries_to_scan: Vec<&crate::libraries::Library> = match library {
|
||||
Some(lib) => vec![lib],
|
||||
None => app_state.libraries.iter().collect(),
|
||||
};
|
||||
|
||||
// (item, date) tuples — `date` is the canonical NaiveDate of the
|
||||
// memory in the client's tz, used as the primary sort key.
|
||||
let mut memories_with_dates: Vec<(MemoryItem, NaiveDate)> = Vec::new();
|
||||
|
||||
for lib in &libraries_to_scan {
|
||||
@@ -572,78 +335,82 @@ pub async fn list_memories(
|
||||
let effective = lib.effective_excluded_dirs(&app_state.excluded_dirs);
|
||||
let path_excluder = PathExcluder::new(base, &effective);
|
||||
|
||||
let exif_memories = collect_exif_memories(
|
||||
&exif_dao,
|
||||
&span_context,
|
||||
&lib.root_path,
|
||||
lib.id,
|
||||
now,
|
||||
span_mode,
|
||||
years_back,
|
||||
&client_timezone,
|
||||
&path_excluder,
|
||||
);
|
||||
let rows = match exif_dao.lock() {
|
||||
Ok(mut dao) => match dao.get_memories_in_window(
|
||||
&span_context,
|
||||
lib.id,
|
||||
span_token,
|
||||
years_back,
|
||||
tz_offset_minutes,
|
||||
) {
|
||||
Ok(rows) => rows,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to query memories for library '{}': {:?}",
|
||||
lib.name, e
|
||||
);
|
||||
continue;
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
warn!("Failed to lock EXIF DAO: {:?}", e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let exif_paths: HashSet<PathBuf> = exif_memories
|
||||
.iter()
|
||||
.map(|(item, _)| PathBuf::from(&lib.root_path).join(&item.path))
|
||||
.collect();
|
||||
for (rel_path, date_taken_ts, last_modified_ts) in rows {
|
||||
// Apply per-library exclusions in Rust — they're a small
|
||||
// set and pushing them into the SQL WHERE adds bind-param
|
||||
// gymnastics with no measurable win at this scale.
|
||||
let full_path = base.join(&rel_path);
|
||||
if path_excluder.is_excluded(&full_path) {
|
||||
trace!("Memory excluded by PathExcluder: {:?}", full_path);
|
||||
continue;
|
||||
}
|
||||
|
||||
let fs_memories = collect_filesystem_memories(
|
||||
&lib.root_path,
|
||||
lib.id,
|
||||
&path_excluder,
|
||||
&exif_paths,
|
||||
now,
|
||||
span_mode,
|
||||
years_back,
|
||||
&client_timezone,
|
||||
);
|
||||
let Some(file_date) = date_in_client_tz(date_taken_ts, client_timezone) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
memories_with_dates.extend(exif_memories);
|
||||
memories_with_dates.extend(fs_memories);
|
||||
memories_with_dates.push((
|
||||
MemoryItem {
|
||||
path: rel_path,
|
||||
created: Some(date_taken_ts),
|
||||
modified: Some(last_modified_ts),
|
||||
library_id: lib.id,
|
||||
},
|
||||
file_date,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Sort once over the merged result set. The SQL filter handles the
|
||||
// matching; sort order is purely UI concern.
|
||||
match span_mode {
|
||||
// Sort by absolute time for a more 'overview'
|
||||
// Month: chronological — gives an "overview" feel.
|
||||
MemoriesSpan::Month => memories_with_dates.sort_by(|a, b| a.1.cmp(&b.1)),
|
||||
// For week span, sort by full date + timestamp (chronological)
|
||||
// Week: full date then timestamp (oldest → newest).
|
||||
MemoriesSpan::Week => {
|
||||
memories_with_dates.sort_by(|a, b| {
|
||||
// First, sort by full date (year, month, day)
|
||||
let date_cmp = a.1.cmp(&b.1);
|
||||
if date_cmp != std::cmp::Ordering::Equal {
|
||||
return date_cmp;
|
||||
}
|
||||
|
||||
// Then sort by full created timestamp (oldest to newest)
|
||||
match (a.0.created, b.0.created) {
|
||||
(Some(a_time), Some(b_time)) => a_time.cmp(&b_time),
|
||||
(Some(_), None) => std::cmp::Ordering::Less,
|
||||
(None, Some(_)) => std::cmp::Ordering::Greater,
|
||||
(None, None) => std::cmp::Ordering::Equal,
|
||||
}
|
||||
});
|
||||
}
|
||||
// For day span, sort by day of month then by time
|
||||
MemoriesSpan::Day => {
|
||||
memories_with_dates.sort_by(|a, b| {
|
||||
let day_comparison = a.1.day().cmp(&b.1.day());
|
||||
|
||||
if day_comparison == std::cmp::Ordering::Equal {
|
||||
match (a.0.created, b.0.created) {
|
||||
(Some(a_time), Some(b_time)) => a_time.cmp(&b_time),
|
||||
a.1.cmp(&b.1)
|
||||
.then_with(|| match (a.0.created, b.0.created) {
|
||||
(Some(at), Some(bt)) => at.cmp(&bt),
|
||||
(Some(_), None) => std::cmp::Ordering::Less,
|
||||
(None, Some(_)) => std::cmp::Ordering::Greater,
|
||||
(None, None) => std::cmp::Ordering::Equal,
|
||||
}
|
||||
} else {
|
||||
day_comparison
|
||||
}
|
||||
})
|
||||
});
|
||||
}
|
||||
// Day: same calendar day across years, sub-sorted by timestamp.
|
||||
MemoriesSpan::Day => {
|
||||
memories_with_dates.sort_by(|a, b| match (a.0.created, b.0.created) {
|
||||
(Some(at), Some(bt)) => at.cmp(&bt),
|
||||
(Some(_), None) => std::cmp::Ordering::Less,
|
||||
(None, Some(_)) => std::cmp::Ordering::Greater,
|
||||
(None, None) => std::cmp::Ordering::Equal,
|
||||
});
|
||||
}
|
||||
}
|
||||
// Sort by day of the month and time (using the created timestamp)
|
||||
|
||||
let items: Vec<MemoryItem> = memories_with_dates.into_iter().map(|(m, _)| m).collect();
|
||||
|
||||
@@ -653,13 +420,7 @@ pub async fn list_memories(
|
||||
KeyValue::new("span", format!("{:?}", span_mode)),
|
||||
KeyValue::new("years_back", years_back.to_string()),
|
||||
KeyValue::new("result_count", items.len().to_string()),
|
||||
KeyValue::new(
|
||||
"client_timezone",
|
||||
format!(
|
||||
"{:?}",
|
||||
client_timezone.unwrap_or_else(|| FixedOffset::east_opt(0).unwrap())
|
||||
),
|
||||
),
|
||||
KeyValue::new("tz_offset_minutes", tz_offset_minutes.to_string()),
|
||||
KeyValue::new("excluded_dirs", format!("{:?}", app_state.excluded_dirs)),
|
||||
],
|
||||
);
|
||||
@@ -668,50 +429,10 @@ pub async fn list_memories(
|
||||
HttpResponse::Ok().json(MemoriesResponse { items })
|
||||
}
|
||||
|
||||
fn is_memories_match(
|
||||
file_path: &str,
|
||||
file_date: NaiveDate,
|
||||
today: NaiveDate,
|
||||
span: MemoriesSpan,
|
||||
years_back: u32,
|
||||
) -> bool {
|
||||
if file_date > today {
|
||||
return false;
|
||||
}
|
||||
let years_diff = (today.year() - file_date.year()).unsigned_abs();
|
||||
if years_diff > years_back {
|
||||
warn!(
|
||||
"File ({}) date is too far in the past: {:?} vs {:?}",
|
||||
file_path, file_date, today
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
match span {
|
||||
MemoriesSpan::Day => same_month_day_any_year(file_date, today),
|
||||
MemoriesSpan::Week => same_week_any_year(file_date, today),
|
||||
MemoriesSpan::Month => same_month_any_year(file_date, today),
|
||||
}
|
||||
}
|
||||
|
||||
fn same_month_day_any_year(a: NaiveDate, b: NaiveDate) -> bool {
|
||||
a.month() == b.month() && a.day() == b.day()
|
||||
}
|
||||
|
||||
// Match same ISO week number and same weekday (ignoring year)
|
||||
fn same_week_any_year(a: NaiveDate, b: NaiveDate) -> bool {
|
||||
a.iso_week().week().eq(&b.iso_week().week())
|
||||
}
|
||||
|
||||
// Match same month (ignoring day and year)
|
||||
fn same_month_any_year(a: NaiveDate, b: NaiveDate) -> bool {
|
||||
a.month() == b.month()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use chrono::Timelike;
|
||||
use chrono::{Datelike, Timelike};
|
||||
use std::fs::{self, File};
|
||||
use tempfile::tempdir;
|
||||
|
||||
@@ -869,99 +590,11 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_memory_date_priority_filename() {
|
||||
let temp_dir = tempdir().unwrap();
|
||||
let temp_file = temp_dir.path().join("Screenshot_2014-06-01-20-44-50.png");
|
||||
File::create(&temp_file).unwrap();
|
||||
|
||||
// Test that filename takes priority (even with EXIF data available)
|
||||
let exif_date = DateTime::<Utc>::from_timestamp(1609459200, 0) // 2021-01-01
|
||||
.unwrap()
|
||||
.timestamp();
|
||||
|
||||
let (date, created, _) = get_memory_date_with_priority(
|
||||
&temp_file,
|
||||
Some(exif_date),
|
||||
&Some(*Local::now().fixed_offset().offset()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Check that date is from filename (2014), NOT EXIF (2021)
|
||||
assert_eq!(date.year(), 2014);
|
||||
assert_eq!(date.month(), 6);
|
||||
assert_eq!(date.day(), 1);
|
||||
|
||||
// Check that created timestamp matches the date from filename
|
||||
assert!(created.is_some());
|
||||
let ts = created.unwrap();
|
||||
// The timestamp should be for 2014-06-01 20:44:50 in the LOCAL timezone
|
||||
let dt_from_ts = Local.timestamp_opt(ts, 0).unwrap();
|
||||
assert_eq!(dt_from_ts.year(), 2014);
|
||||
assert_eq!(dt_from_ts.month(), 6);
|
||||
assert_eq!(dt_from_ts.day(), 1);
|
||||
assert_eq!(dt_from_ts.hour(), 20);
|
||||
assert_eq!(dt_from_ts.minute(), 44);
|
||||
assert_eq!(dt_from_ts.second(), 50);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_memory_date_priority_metadata_fallback() {
|
||||
let temp_dir = tempdir().unwrap();
|
||||
let temp_file = temp_dir.path().join("regular_image.jpg");
|
||||
File::create(&temp_file).unwrap();
|
||||
|
||||
// Test metadata fallback when no filename date or EXIF
|
||||
let (date, created, modified) =
|
||||
get_memory_date_with_priority(&temp_file, None, &None).unwrap();
|
||||
|
||||
// Both date and timestamps should be from metadata (recent)
|
||||
let today = Local::now().date_naive();
|
||||
assert_eq!(date.year(), today.year());
|
||||
assert_eq!(date.month(), today.month());
|
||||
|
||||
// Both timestamps should be valid
|
||||
assert!(created.is_some());
|
||||
assert!(modified.is_some());
|
||||
|
||||
// Check that timestamps are recent
|
||||
let dt_created = DateTime::<Utc>::from_timestamp(created.unwrap(), 0).unwrap();
|
||||
assert_eq!(dt_created.year(), today.year());
|
||||
|
||||
let dt_modified = DateTime::<Utc>::from_timestamp(modified.unwrap(), 0).unwrap();
|
||||
assert_eq!(dt_modified.year(), today.year());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_memory_date_priority_exif_over_metadata() {
|
||||
let temp_dir = tempdir().unwrap();
|
||||
let temp_file = temp_dir.path().join("regular_image.jpg");
|
||||
File::create(&temp_file).unwrap();
|
||||
|
||||
// Test that EXIF takes priority over metadata (but not filename)
|
||||
// EXIF date: June 15, 2020 12:00:00 UTC (safe from timezone edge cases)
|
||||
let exif_date = DateTime::<Utc>::from_timestamp(1592222400, 0) // 2020-06-15 12:00:00 UTC
|
||||
.unwrap()
|
||||
.timestamp();
|
||||
|
||||
let (date, created, modified) =
|
||||
get_memory_date_with_priority(&temp_file, Some(exif_date), &None).unwrap();
|
||||
|
||||
// Date should be from EXIF (2020), not metadata (today)
|
||||
assert_eq!(date.year(), 2020);
|
||||
assert_eq!(date.month(), 6);
|
||||
assert_eq!(date.day(), 15);
|
||||
|
||||
// Created timestamp should also be from EXIF
|
||||
assert!(created.is_some());
|
||||
assert_eq!(created.unwrap(), exif_date);
|
||||
|
||||
// Modified should still be from metadata
|
||||
assert!(modified.is_some());
|
||||
let today = Local::now().date_naive();
|
||||
let dt_modified = DateTime::<Utc>::from_timestamp(modified.unwrap(), 0).unwrap();
|
||||
assert_eq!(dt_modified.year(), today.year());
|
||||
}
|
||||
// The obsolete `test_memory_date_priority_*` tests covered the old
|
||||
// request-time waterfall in `get_memory_date_with_priority`. Their
|
||||
// replacement lives in `crate::date_resolver::tests` (resolver
|
||||
// waterfall) and the SQL surface is exercised by integration tests
|
||||
// that hit `get_memories_in_window` directly.
|
||||
|
||||
#[test]
|
||||
fn test_path_excluder_absolute_under_base() {
|
||||
|
||||
Reference in New Issue
Block a user