faces: tighten bootstrap candidate filter, bump to 1.1.0

Filter <3-char tags and emoji/symbol-bearing tags out of the bootstrap
candidate list before grouping. Manual testing surfaced these as noise
the operator never ticks — they pushed real candidates lower in the
list and made the UI harder to scan. This is a hard filter (drop from
candidates entirely), not a heuristic flag — looks_like_person still
governs the default-checked decision for the rows that *do* survive.

is_plausible_name_token rules:
  - >= 3 chars after trimming (rejects "AB", "OK", whitespace-only)
  - Each char is alphabetic (any script — covers Renée, José, 田中太郎),
    whitespace, name-punctuation (' - . _ U+2019), or ASCII digit
  - Anything else (emoji, symbols, math, arrows, control codes) drops
    the whole tag

Digits stay allowed at this layer; looks_like_person handles "Trip 2018"
on the heuristic side. Lets a "Sarah2" alias still appear so the
operator can spot and confirm it manually, just unticked by default.

Cargo version bump 1.0.0 → 1.1.0 marks the face-recog feature surface
landing — Phase 2's schema + endpoints, Phase 3's file-watch hook, and
Phase 4's bootstrap + auto-bind are all behind APOLLO_FACE_API_BASE_URL,
so legacy 1.0 deploys without that env see no behavior change.

Tests: 1 new (faces::tests::is_plausible_name_token_filters_short_and_emoji)
covers the accept-list (Latin/accented/Asian scripts, hyphenated and
apostrophe names) and the reject-list (length floor, emoji classes,
symbols, leading/trailing whitespace handling).

cargo test --lib: 180 / 0; fmt + clippy clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-04-29 19:05:04 +00:00
parent 1859399759
commit 41f93d70d1
3 changed files with 87 additions and 4 deletions

2
Cargo.lock generated
View File

@@ -1913,7 +1913,7 @@ dependencies = [
[[package]] [[package]]
name = "image-api" name = "image-api"
version = "1.0.0" version = "1.1.0"
dependencies = [ dependencies = [
"actix", "actix",
"actix-cors", "actix-cors",

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "image-api" name = "image-api"
version = "1.0.0" version = "1.1.0"
authors = ["Cameron Cordes <cameronc.dev@gmail.com>"] authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
edition = "2024" edition = "2024"

View File

@@ -1456,6 +1456,47 @@ pub struct BootstrapSkipped {
pub reason: String, pub reason: String,
} }
/// Hard filter for the bootstrap candidate list. Returns `true` if the tag
/// could plausibly be a person name; returns `false` to drop it from the
/// candidates entirely (not just leave `looks_like_person=false`).
///
/// Rules — all required:
/// - At least 3 characters after trimming. Two-letter tags ("AB", "OK")
///   are almost always abbreviations or markers, not names.
/// - Every remaining character is alphabetic (any script), non-control
///   whitespace, name punctuation (`'`, `-`, `.`, `_`, U+2019), or an
///   ASCII digit.
/// - No emoji or symbol-class characters. SQL-side string sort already
///   surfaces those at the top of the tag list; filtering them keeps
///   the candidate UI focused on names rather than chart-junk.
/// - No control characters or null bytes. This includes whitespace-class
///   controls (tab, newline, carriage return, ...) appearing *inside* the
///   tag; leading/trailing ones are removed by the trim and do not count.
///
/// Digits don't disqualify here — `looks_like_person` rejects them later,
/// but this function is just about "could this be in the candidate list
/// at all?". A tag like "Sarah2" stays as a candidate (display-flagged
/// not-a-person by `looks_like_person`) so the operator can still spot
/// and confirm it manually if it's an alias.
pub(crate) fn is_plausible_name_token(raw: &str) -> bool {
    let trimmed = raw.trim();
    if trimmed.chars().count() < 3 {
        return false;
    }
    // NOTE(review): `char::is_alphabetic` is evaluated per scalar value, so a
    // decomposed accent (base letter + standalone combining mark) is
    // presumably rejected — confirm whether tags are normalised upstream.
    trimmed.chars().all(|c| {
        c.is_alphabetic()
            // `is_whitespace` alone also matches tab/LF/CR/VT/FF/NEL, which
            // are control codes; only real spacing characters are allowed.
            || (c.is_whitespace() && !c.is_control())
            || matches!(c, '\'' | '-' | '.' | '_' | '\u{2019}')
            || c.is_ascii_digit()
    })
}
/// Conservative "this tag *might* be a person name" heuristic. False /// Conservative "this tag *might* be a person name" heuristic. False
/// negatives are fine — the operator confirms in the UI before any row /// negatives are fine — the operator confirms in the UI before any row
/// is created. False positives are also fine for the same reason; the /// is created. False positives are also fine for the same reason; the
@@ -1574,8 +1615,11 @@ async fn bootstrap_candidates_handler<D: FaceDao>(
} }
}; };
// Group by lowercase name. Pick the most-frequent capitalization for // Group by lowercase name. Pick the most-frequent capitalization
// the display name (ties broken by first-seen). // for the display name (ties broken by first-seen). Filter out
// short tags and tags carrying non-name characters (emojis, symbols)
// before grouping — they're noise no operator would tick, so showing
// them just makes the candidate list harder to scan.
struct Group { struct Group {
display: String, display: String,
display_freq: i64, display_freq: i64,
@@ -1583,6 +1627,9 @@ async fn bootstrap_candidates_handler<D: FaceDao>(
} }
let mut groups: HashMap<String, Group> = HashMap::new(); let mut groups: HashMap<String, Group> = HashMap::new();
for (count, tag) in tags_with_counts { for (count, tag) in tags_with_counts {
if !is_plausible_name_token(&tag.name) {
continue;
}
let lower = tag.name.to_lowercase(); let lower = tag.name.to_lowercase();
let g = groups.entry(lower).or_insert_with(|| Group { let g = groups.entry(lower).or_insert_with(|| Group {
display: tag.name.clone(), display: tag.name.clone(),
@@ -2282,6 +2329,42 @@ mod tests {
// ── Phase 4: bootstrap heuristic + cosine + DAO support ───────────── // ── Phase 4: bootstrap heuristic + cosine + DAO support ─────────────
#[test]
fn is_plausible_name_token_filters_short_and_emoji() {
    // The hard filter runs before grouping: tags under 3 characters or
    // carrying emoji/symbols must never reach the candidate list, whatever
    // `looks_like_person` would say about them afterwards.

    // Tags that must survive the filter.
    let accepted = [
        // Plain, spaced, and punctuated Latin names.
        "Cameron",
        "Sarah Smith",
        "O'Brien",
        "Jean-Luc",
        "St. James",
        // Accented Latin.
        "Renée",
        "José",
        // Non-Latin script — the alphabetic check is script-agnostic.
        "田中太郎",
        // Digits are tolerated at this layer (looks_like_person's job).
        "Trip 2018",
        "2024",
    ];
    for tag in accepted {
        assert!(
            is_plausible_name_token(tag),
            "expected {tag:?} to be kept as a candidate"
        );
    }

    // Tags that must be dropped outright.
    let rejected = [
        // Under the 3-character floor.
        "",
        " ",
        "Bo",
        "AB",
        // Length is measured after trimming surrounding whitespace.
        " AB ",
        // Any emoji or symbol disqualifies the whole tag.
        "🐱cat",
        "Heart ❤",
        "📸Photo",
        "→ Trip",
        "★Vacation",
    ];
    for tag in rejected {
        assert!(
            !is_plausible_name_token(tag),
            "expected {tag:?} to be filtered out"
        );
    }
}
#[test] #[test]
fn looks_like_person_accepts_typical_names() { fn looks_like_person_accepts_typical_names() {
assert!(looks_like_person("Cameron")); assert!(looks_like_person("Cameron"));