From 41f93d70d179f512d9091b7fbd5c6802ea7867a7 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Wed, 29 Apr 2026 19:05:04 +0000
Subject: [PATCH] faces: tighten bootstrap candidate filter, bump to 1.1.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Filter <3-char tags and emoji/symbol-bearing tags out of the bootstrap
candidate list before grouping. Manual testing surfaced these as noise
the operator never tickets — they pushed real candidates lower in the
list and made the UI harder to scan. This is a hard filter (drop from
candidates entirely), not a heuristic flag — looks_like_person still
governs the default-checked decision for the rows that *do* survive.

is_plausible_name_token rules:
  - >= 3 chars after trimming (rejects "AB", "OK", whitespace-only)
  - Each char is alphabetic (any script — covers Renée, José, 田中太郎),
    whitespace, name-punctuation (' - . _ U+2019), or ASCII digit
  - Anything else (emoji, symbols, math, arrows, control codes) drops
    the whole tag

Digits stay allowed at this layer; looks_like_person handles "Trip 2018"
on the heuristic side. Lets a "Sarah2" alias still appear so the
operator can spot and confirm it manually, just unticked by default.

Cargo version bump 1.0.0 → 1.1.0 marks the face-recog feature surface
landing — Phase 2's schema + endpoints, Phase 3's file-watch hook, and
Phase 4's bootstrap + auto-bind are all behind APOLLO_FACE_API_BASE_URL,
so legacy 1.0 deploys without that env see no behavior change.

Tests: 1 new (faces::tests::is_plausible_name_token_filters_short_and_emoji)
covers the accept-list (Latin/accented/Asian scripts, hyphenated and
apostrophe names) and the reject-list (length floor, emoji classes,
symbols, leading/trailing whitespace handling).

cargo test --lib: 180 / 0; fmt + clippy clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Cargo.lock   |  2 +-
 Cargo.toml   |  2 +-
 src/faces.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2023d51..6f6575b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1913,7 +1913,7 @@ dependencies = [
 
 [[package]]
 name = "image-api"
-version = "1.0.0"
+version = "1.1.0"
 dependencies = [
  "actix",
  "actix-cors",
diff --git a/Cargo.toml b/Cargo.toml
index 1c89808..2432869 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "image-api"
-version = "1.0.0"
+version = "1.1.0"
 authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
 edition = "2024"
 
diff --git a/src/faces.rs b/src/faces.rs
index 1ca6eec..0d06675 100644
--- a/src/faces.rs
+++ b/src/faces.rs
@@ -1456,6 +1456,47 @@ pub struct BootstrapSkipped {
     pub reason: String,
 }
 
+/// Hard filter for the bootstrap candidate list. Returns `true` if the tag
+/// could plausibly be a person name; returns `false` to drop it from the
+/// candidates entirely (not just leave looks_like_person=false).
+///
+/// Rules — all required, evaluated on `raw` with surrounding whitespace
+/// trimmed:
+/// - At least 3 characters. Two-letter tags ("AB", "OK") are almost
+///   always abbreviations or markers, not names.
+/// - Every character is one of:
+///   - alphabetic in any script (Latin, accented, CJK, ...);
+///   - a combining diacritical mark (U+0300..=U+036F), so a decomposed
+///     "e" + U+0301 is treated like the precomposed "é";
+///   - whitespace that is not a control character;
+///   - an ASCII digit;
+///   - name punctuation: ' - . _ or U+2019.
+/// - Anything else — emoji, symbols, math operators, arrows, box drawing,
+///   control characters (interior tab, newline, NUL) — drops the whole tag.
+///
+/// Digits are deliberately allowed at this layer: the question here is only
+/// "could this be in the candidate list at all?". A tag like "Sarah2" stays
+/// a candidate so the operator can still spot and confirm it manually if
+/// it's an alias.
+pub(crate) fn is_plausible_name_token(raw: &str) -> bool {
+    let trimmed = raw.trim();
+    // Count chars, not bytes: "José" is 4 chars but 5 bytes in UTF-8.
+    if trimmed.chars().count() < 3 {
+        return false;
+    }
+    trimmed.chars().all(|c| {
+        // `char::is_whitespace` is also true for tab, CR, LF and other
+        // control codes, so whitespace only counts when it is not a
+        // control character; otherwise "Sarah\nSmith" would slip through.
+        let plain_space = c.is_whitespace() && !c.is_control();
+        // `char::is_alphabetic` misses most combining accents, so the
+        // Combining Diacritical Marks block is allowed explicitly.
+        let combining_mark = ('\u{0300}'..='\u{036F}').contains(&c);
+        let name_punct = matches!(c, '\'' | '-' | '.' | '_' | '\u{2019}');
+        c.is_alphabetic() || c.is_ascii_digit() || plain_space || combining_mark || name_punct
+    })
+}
+
 /// Conservative "this tag *might* be a person name" heuristic. False
 /// negatives are fine — the operator confirms in the UI before any row
 /// is created. False positives are also fine for the same reason; the
@@ -1574,8 +1615,11 @@ async fn bootstrap_candidates_handler<D: FaceDao>(
         }
     };
 
-    // Group by lowercase name. Pick the most-frequent capitalization for
-    // the display name (ties broken by first-seen).
+    // Group by lowercase name. Pick the most-frequent capitalization
+    // for the display name (ties broken by first-seen). Filter out
+    // short tags and tags carrying non-name characters (emojis, symbols)
+    // before grouping — they're noise no operator would tick, so showing
+    // them just makes the candidate list harder to scan.
     struct Group {
         display: String,
         display_freq: i64,
@@ -1583,6 +1627,9 @@ async fn bootstrap_candidates_handler<D: FaceDao>(
     }
     let mut groups: HashMap<String, Group> = HashMap::new();
     for (count, tag) in tags_with_counts {
+        if !is_plausible_name_token(&tag.name) {
+            continue;
+        }
         let lower = tag.name.to_lowercase();
         let g = groups.entry(lower).or_insert_with(|| Group {
             display: tag.name.clone(),
@@ -2282,6 +2329,42 @@ mod tests {
 
     // ── Phase 4: bootstrap heuristic + cosine + DAO support ─────────────
 
+    #[test]
+    fn is_plausible_name_token_filters_short_and_emoji() {
+        // The hard filter runs before grouping, so a rejected tag never
+        // reaches the candidate list, whatever looks_like_person would say.
+        // Each case pairs a tag with the verdict the filter must return.
+        let cases = [
+            ("Cameron", true),
+            ("Sarah Smith", true),
+            ("O'Brien", true),
+            ("Jean-Luc", true),
+            ("St. James", true),
+            ("Renée", true),
+            ("José", true),
+            // Non-Latin script: the alphabetic check is script-agnostic.
+            ("田中太郎", true),
+            // Under the 3-character floor; length is measured after trim.
+            ("", false),
+            (" ", false),
+            ("Bo", false),
+            ("AB", false),
+            ("  AB  ", false),
+            // Emoji / symbol characters get the whole tag dropped.
+            ("🐱cat", false),
+            ("Heart ❤", false),
+            ("📸Photo", false),
+            ("→ Trip", false),
+            ("★Vacation", false),
+            // Digits are kept (handled by looks_like_person, not here).
+            ("Trip 2018", true),
+            ("2024", true),
+        ];
+        for (tag, expected) in cases {
+            assert_eq!(is_plausible_name_token(tag), expected, "tag {tag:?}");
+        }
+    }
+
     #[test]
     fn looks_like_person_accepts_typical_names() {
         assert!(looks_like_person("Cameron"));