From 41f93d70d179f512d9091b7fbd5c6802ea7867a7 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 29 Apr 2026 19:05:04 +0000 Subject: [PATCH] faces: tighten bootstrap candidate filter, bump to 1.1.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Filter <3-char tags and emoji/symbol-bearing tags out of the bootstrap candidate list before grouping. Manual testing surfaced these as noise the operator never ticks — they pushed real candidates lower in the list and made the UI harder to scan. This is a hard filter (drop from candidates entirely), not a heuristic flag — looks_like_person still governs the default-checked decision for the rows that *do* survive. is_plausible_name_token rules: - >= 3 chars after trimming (rejects "AB", "OK", whitespace-only) - Each char is alphabetic (any script — covers Renée, José, 田中太郎), whitespace, name-punctuation (' - . _ U+2019), or ASCII digit - Anything else (emoji, symbols, math, arrows, control codes) drops the whole tag Digits stay allowed at this layer; looks_like_person handles "Trip 2018" on the heuristic side. Lets a "Sarah2" alias still appear so the operator can spot and confirm it manually, just unticked by default. Cargo version bump 1.0.0 → 1.1.0 marks the face-recog feature surface landing — Phase 2's schema + endpoints, Phase 3's file-watch hook, and Phase 4's bootstrap + auto-bind are all behind APOLLO_FACE_API_BASE_URL, so legacy 1.0 deploys without that env see no behavior change. Tests: 1 new (faces::tests::is_plausible_name_token_filters_short_and_emoji) covers the accept-list (Latin/accented/Asian scripts, hyphenated and apostrophe names) and the reject-list (length floor, emoji classes, symbols, leading/trailing whitespace handling). cargo test --lib: 180 / 0; fmt + clippy clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/faces.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 87 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2023d51..6f6575b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1913,7 +1913,7 @@ dependencies = [ [[package]] name = "image-api" -version = "1.0.0" +version = "1.1.0" dependencies = [ "actix", "actix-cors", diff --git a/Cargo.toml b/Cargo.toml index 1c89808..2432869 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "image-api" -version = "1.0.0" +version = "1.1.0" authors = ["Cameron Cordes "] edition = "2024" diff --git a/src/faces.rs b/src/faces.rs index 1ca6eec..0d06675 100644 --- a/src/faces.rs +++ b/src/faces.rs @@ -1456,6 +1456,47 @@ pub struct BootstrapSkipped { pub reason: String, } +/// Hard filter for the bootstrap candidate list. Returns true if the tag +/// could plausibly be a person name; returns false to drop it from the +/// candidates entirely (not just leave looks_like_person=false). +/// +/// Rules — all required: +/// - At least 3 characters after trimming. Two-letter tags ("AB", "OK") +/// are almost always abbreviations or markers, not names. +/// - No emoji or symbol-class characters. SQL-side string sort already +/// surfaces those at the top of the tag list; filtering them keeps +/// the candidate UI focused on names rather than chart-junk. +/// - No control characters or null bytes. +pub(crate) fn is_plausible_name_token(raw: &str) -> bool { + let trimmed = raw.trim(); + if trimmed.chars().count() < 3 { + return false; + } + for c in trimmed.chars() { + // Letter / mark / decimal-digit / connector-punctuation / + // dash / apostrophe / period / whitespace are all plausible in a + // name. Anything else (emoji, symbols, math operators, arrows, + // box drawing, control codes) disqualifies the whole tag. 
+ if c.is_alphabetic() + || c.is_whitespace() + || matches!(c, '\'' | '-' | '.' | '_' | '\u{2019}') + { + continue; + } + if c.is_ascii_digit() { + // Digits don't disqualify here — `looks_like_person` rejects + // them later, but `is_plausible_name_token` is just about + // "could this be in the candidate list at all?". A tag like + // "Sarah2" stays as a candidate (display-flagged not-a-person + // by looks_like_person) so the operator can still spot and + // confirm it manually if it's an alias. + continue; + } + return false; + } + true +} + /// Conservative "this tag *might* be a person name" heuristic. False /// negatives are fine — the operator confirms in the UI before any row /// is created. False positives are also fine for the same reason; the @@ -1574,8 +1615,11 @@ async fn bootstrap_candidates_handler( } }; - // Group by lowercase name. Pick the most-frequent capitalization for - // the display name (ties broken by first-seen). + // Group by lowercase name. Pick the most-frequent capitalization + // for the display name (ties broken by first-seen). Filter out + // short tags and tags carrying non-name characters (emojis, symbols) + // before grouping — they're noise no operator would tick, so showing + // them just makes the candidate list harder to scan. 
struct Group { display: String, display_freq: i64, @@ -1583,6 +1627,9 @@ async fn bootstrap_candidates_handler( } let mut groups: HashMap = HashMap::new(); for (count, tag) in tags_with_counts { + if !is_plausible_name_token(&tag.name) { + continue; + } let lower = tag.name.to_lowercase(); let g = groups.entry(lower).or_insert_with(|| Group { display: tag.name.clone(), @@ -2282,6 +2329,42 @@ mod tests { // ── Phase 4: bootstrap heuristic + cosine + DAO support ───────────── + #[test] + fn is_plausible_name_token_filters_short_and_emoji() { + // Hard filter applied before grouping — emojis and tags shorter + // than 3 chars never make it into the candidate list, regardless + // of looks_like_person's later assessment. + assert!(is_plausible_name_token("Cameron")); + assert!(is_plausible_name_token("Sarah Smith")); + assert!(is_plausible_name_token("O'Brien")); + assert!(is_plausible_name_token("Jean-Luc")); + assert!(is_plausible_name_token("St. James")); + assert!(is_plausible_name_token("Renée")); + assert!(is_plausible_name_token("José")); + // Asian script names — the alphabetic/letter check covers any + // script, not just Latin. + assert!(is_plausible_name_token("田中太郎")); + + // Below the 3-character floor. + assert!(!is_plausible_name_token("")); + assert!(!is_plausible_name_token(" ")); + assert!(!is_plausible_name_token("Bo")); + assert!(!is_plausible_name_token("AB")); + // Trim before counting — surrounding whitespace doesn't count. + assert!(!is_plausible_name_token(" AB ")); + + // Emoji / symbol classes get the whole tag dropped. + assert!(!is_plausible_name_token("🐱cat")); + assert!(!is_plausible_name_token("Heart ❤")); + assert!(!is_plausible_name_token("📸Photo")); + assert!(!is_plausible_name_token("→ Trip")); + assert!(!is_plausible_name_token("★Vacation")); + + // Digits are kept (handled by looks_like_person, not here). 
+ assert!(is_plausible_name_token("Trip 2018")); + assert!(is_plausible_name_token("2024")); + } + #[test] fn looks_like_person_accepts_typical_names() { assert!(looks_like_person("Cameron"));