faces: tighten bootstrap candidate filter, bump to 1.1.0

Filter <3-char tags and emoji/symbol-bearing tags out of the bootstrap candidate list before grouping. Manual testing surfaced these as noise the operator never ticks — they pushed real candidates lower in the list and made the UI harder to scan. This is a hard filter (drop from candidates entirely), not a heuristic flag — looks_like_person still governs the default-checked decision for the rows that *do* survive. is_plausible_name_token rules: - >= 3 chars after trimming (rejects "AB", "OK", whitespace-only) - Each char is alphabetic (any script — covers Renée, José, 田中太郎), whitespace, name-punctuation (' - . _ U+2019), or ASCII digit - Anything else (emoji, symbols, math, arrows, control codes) drops the whole tag Digits stay allowed at this layer; looks_like_person handles "Trip 2018" on the heuristic side. Lets a "Sarah2" alias still appear so the operator can spot and confirm it manually, just unticked by default. Cargo version bump 1.0.0 → 1.1.0 marks the face-recog feature surface landing — Phase 2's schema + endpoints, Phase 3's file-watch hook, and Phase 4's bootstrap + auto-bind are all behind APOLLO_FACE_API_BASE_URL, so legacy 1.0 deploys without that env see no behavior change. Tests: 1 new (faces::tests::is_plausible_name_token_filters_short_and_emoji) covers the accept-list (Latin/accented/Asian scripts, hyphenated and apostrophe names) and the reject-list (length floor, emoji classes, symbols, leading/trailing whitespace handling). cargo test --lib: 180 / 0; fmt + clippy clean. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 19:05:04 +00:00
parent 1859399759
commit 41f93d70d1
3 changed files with 87 additions and 4 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1913,7 +1913,7 @@ dependencies = [

 [[package]]
 name = "image-api"
-version = "1.0.0"
+version = "1.1.0"
 dependencies = [
 "actix",
 "actix-cors",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "image-api"
-version = "1.0.0"
+version = "1.1.0"
 authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
 edition = "2024"

--- a/src/faces.rs
+++ b/src/faces.rs
@@ -1456,6 +1456,47 @@ pub struct BootstrapSkipped {
    pub reason: String,
 }

+/// Hard filter for the bootstrap candidate list. Returns `true` if the tag
+/// could plausibly be a person name; `false` drops it from the candidates
+/// entirely (not just leave looks_like_person=false).
+///
+/// Rules — all required, applied to the trimmed tag:
+/// - At least 3 chars. Two-letter tags ("AB", "OK") are almost always
+///   abbreviations or markers, not names.
+/// - No emoji or symbol-class characters: SQL-side string sort surfaces
+///   those at the top of the tag list, burying names under chart-junk.
+/// - NOTE(review): control chars are rejected unless `is_whitespace`
+///   accepts them (interior tab/newline pass) — confirm that is intended.
+pub(crate) fn is_plausible_name_token(raw: &str) -> bool {
+    let trimmed = raw.trim();
+    if trimmed.chars().count() < 3 {
+        return false;
+    }
+    for c in trimmed.chars() {
+        // Accept-list: alphabetic (any script), whitespace, ASCII or
+        // U+2019 apostrophe, hyphen, period, underscore; ASCII digits are
+        // handled below. Everything else (emoji, symbols, arrows, …)
+        // drops the tag. NOTE(review): NFD combining accents may fail too.
+        if c.is_alphabetic()
+            || c.is_whitespace()
+            || matches!(c, '\'' | '-' | '.' | '_' | '\u{2019}')
+        {
+            continue;
+        }
+        if c.is_ascii_digit() {
+            // ASCII digits don't disqualify here — `looks_like_person`
+            // rejects them later; this check only decides "could this
+            // be in the candidate list at all?". A tag like "Sarah2"
+            // stays as a candidate (display-flagged not-a-person by
+            // looks_like_person) so the operator can still spot and
+            // confirm it manually if it's an alias.
+            continue;
+        }
+        return false;
+    }
+    true
+}
+
 /// Conservative "this tag *might* be a person name" heuristic. False
 /// negatives are fine — the operator confirms in the UI before any row
 /// is created. False positives are also fine for the same reason; the
@@ -1574,8 +1615,11 @@ async fn bootstrap_candidates_handler<D: FaceDao>(
        }
    };

-    // Group by lowercase name. Pick the most-frequent capitalization for
-    // the display name (ties broken by first-seen).
+    // Group by lowercase name. Pick the most-frequent capitalization
+    // for the display name (ties broken by first-seen). Filter out
+    // short tags and tags carrying non-name characters (emojis, symbols)
+    // before grouping — they're noise no operator would tick, so showing
+    // them just makes the candidate list harder to scan.
    struct Group {
        display: String,
        display_freq: i64,
@@ -1583,6 +1627,9 @@ async fn bootstrap_candidates_handler<D: FaceDao>(
    }
    let mut groups: HashMap<String, Group> = HashMap::new();
    for (count, tag) in tags_with_counts {
+        if !is_plausible_name_token(&tag.name) {
+            continue;
+        }
        let lower = tag.name.to_lowercase();
        let g = groups.entry(lower).or_insert_with(|| Group {
            display: tag.name.clone(),
@@ -2282,6 +2329,42 @@ mod tests {

    // ── Phase 4: bootstrap heuristic + cosine + DAO support ─────────────

+    #[test]
+    fn is_plausible_name_token_filters_short_and_emoji() {
+        // Pins the hard filter that runs before grouping: tags under
+        // 3 chars or carrying emoji/symbols must be rejected here,
+        // whatever looks_like_person would later say about them.
+        assert!(is_plausible_name_token("Cameron"));
+        assert!(is_plausible_name_token("Sarah Smith"));
+        assert!(is_plausible_name_token("O'Brien"));
+        assert!(is_plausible_name_token("Jean-Luc"));
+        assert!(is_plausible_name_token("St. James"));
+        assert!(is_plausible_name_token("Renée"));
+        assert!(is_plausible_name_token("José"));
+        // CJK name — the alphabetic check covers any script, not just
+        // Latin.
+        assert!(is_plausible_name_token("田中太郎"));

+        // Below the 3-character floor (empty and whitespace-only too).
+        assert!(!is_plausible_name_token(""));
+        assert!(!is_plausible_name_token(" "));
+        assert!(!is_plausible_name_token("Bo"));
+        assert!(!is_plausible_name_token("AB"));
+        // Length is measured after trimming, so padding can't rescue "AB".
+        assert!(!is_plausible_name_token("  AB  "));

+        // A single emoji or symbol anywhere drops the whole tag.
+        assert!(!is_plausible_name_token("🐱cat"));
+        assert!(!is_plausible_name_token("Heart ❤"));
+        assert!(!is_plausible_name_token("📸Photo"));
+        assert!(!is_plausible_name_token("→ Trip"));
+        assert!(!is_plausible_name_token("★Vacation"));

+        // ASCII digits pass here, even alone; looks_like_person judges them.
+        assert!(is_plausible_name_token("Trip 2018"));
+        assert!(is_plausible_name_token("2024"));
+    }
+
    #[test]
    fn looks_like_person_accepts_typical_names() {
        assert!(looks_like_person("Cameron"));