faces: tighten bootstrap candidate filter, bump to 1.1.0
Filter <3-char tags and emoji/symbol-bearing tags out of the bootstrap
candidate list before grouping. Manual testing surfaced these as noise
the operator never ticks — they pushed real candidates lower in the
list and made the UI harder to scan. This is a hard filter (drop from
candidates entirely), not a heuristic flag — looks_like_person still
governs the default-checked decision for the rows that *do* survive.
is_plausible_name_token rules:
- >= 3 chars after trimming (rejects "AB", "OK", whitespace-only)
- Each char is alphabetic (any script — covers Renée, José, 田中太郎),
whitespace, name-punctuation (' - . _ U+2019), or ASCII digit
- Anything else (emoji, symbols, math, arrows, control codes) drops
the whole tag
Digits stay allowed at this layer; looks_like_person handles "Trip 2018"
on the heuristic side. Lets a "Sarah2" alias still appear so the
operator can spot and confirm it manually, just unticked by default.
Cargo version bump 1.0.0 → 1.1.0 marks the face-recog feature surface
landing — Phase 2's schema + endpoints, Phase 3's file-watch hook, and
Phase 4's bootstrap + auto-bind are all behind APOLLO_FACE_API_BASE_URL,
so legacy 1.0 deploys without that env see no behavior change.
Tests: 1 new (faces::tests::is_plausible_name_token_filters_short_and_emoji)
covers the accept-list (Latin/accented/Asian scripts, hyphenated and
apostrophe names) and the reject-list (length floor, emoji classes,
symbols, leading/trailing whitespace handling).
cargo test --lib: 180 / 0; fmt + clippy clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -1913,7 +1913,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "image-api"
|
||||
version = "1.0.0"
|
||||
version = "1.1.0"
|
||||
dependencies = [
|
||||
"actix",
|
||||
"actix-cors",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "image-api"
|
||||
version = "1.0.0"
|
||||
version = "1.1.0"
|
||||
authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
|
||||
edition = "2024"
|
||||
|
||||
|
||||
87
src/faces.rs
87
src/faces.rs
@@ -1456,6 +1456,47 @@ pub struct BootstrapSkipped {
|
||||
pub reason: String,
|
||||
}
|
||||
|
||||
/// Hard filter for the bootstrap candidate list.
///
/// Returns `true` if the tag could plausibly be a person name; returns
/// `false` to drop it from the candidates entirely (not just leave
/// `looks_like_person = false`).
///
/// Rules — all required:
/// - At least 3 characters after trimming, not counting combining
///   diacritical marks (so "Bó" is two characters whether it is stored
///   precomposed or decomposed). Two-letter tags ("AB", "OK") are almost
///   always abbreviations or markers, not names.
/// - Every character is alphabetic (any script), a combining diacritical
///   mark, non-control whitespace, one of `'` `-` `.` `_` `’` (U+2019),
///   or an ASCII digit.
/// - No emoji or symbol-class characters. SQL-side string sort already
///   surfaces those at the top of the tag list; filtering them keeps
///   the candidate UI focused on names rather than chart-junk.
/// - No control characters or null bytes — including tab / newline /
///   carriage return, which `char::is_whitespace` would otherwise let
///   through in the interior of a tag.
///
/// Digits don't disqualify here — `looks_like_person` rejects them
/// later, but this function only answers "could this be in the candidate
/// list at all?". A tag like "Sarah2" stays as a candidate so the
/// operator can still spot and confirm it manually if it's an alias.
pub(crate) fn is_plausible_name_token(raw: &str) -> bool {
    /// Combining Diacritical Marks block (U+0300..=U+036F). Most of these
    /// are not `char::is_alphabetic`, so decomposed (NFD) spellings such
    /// as "Jose\u{301}" need them accepted explicitly.
    fn is_combining_diacritic(c: char) -> bool {
        matches!(c, '\u{0300}'..='\u{036F}')
    }

    let trimmed = raw.trim();

    // Count base characters only, so the length floor gives the same
    // answer for precomposed and decomposed spellings of the same name.
    let base_chars = trimmed
        .chars()
        .filter(|&c| !is_combining_diacritic(c))
        .count();
    if base_chars < 3 {
        return false;
    }

    // Anything outside the accept-list (emoji, symbols, math operators,
    // arrows, box drawing, control codes) disqualifies the whole tag.
    trimmed.chars().all(|c| {
        c.is_alphabetic()
            || is_combining_diacritic(c)
            // `is_whitespace` alone also matches \t, \n, \r, U+000B,
            // U+000C and U+0085, which are control codes.
            || (c.is_whitespace() && !c.is_control())
            || matches!(c, '\'' | '-' | '.' | '_' | '\u{2019}')
            || c.is_ascii_digit()
    })
}
|
||||
|
||||
/// Conservative "this tag *might* be a person name" heuristic. False
|
||||
/// negatives are fine — the operator confirms in the UI before any row
|
||||
/// is created. False positives are also fine for the same reason; the
|
||||
@@ -1574,8 +1615,11 @@ async fn bootstrap_candidates_handler<D: FaceDao>(
|
||||
}
|
||||
};
|
||||
|
||||
// Group by lowercase name. Pick the most-frequent capitalization for
|
||||
// the display name (ties broken by first-seen).
|
||||
// Group by lowercase name. Pick the most-frequent capitalization
|
||||
// for the display name (ties broken by first-seen). Filter out
|
||||
// short tags and tags carrying non-name characters (emojis, symbols)
|
||||
// before grouping — they're noise no operator would tick, so showing
|
||||
// them just makes the candidate list harder to scan.
|
||||
struct Group {
|
||||
display: String,
|
||||
display_freq: i64,
|
||||
@@ -1583,6 +1627,9 @@ async fn bootstrap_candidates_handler<D: FaceDao>(
|
||||
}
|
||||
let mut groups: HashMap<String, Group> = HashMap::new();
|
||||
for (count, tag) in tags_with_counts {
|
||||
if !is_plausible_name_token(&tag.name) {
|
||||
continue;
|
||||
}
|
||||
let lower = tag.name.to_lowercase();
|
||||
let g = groups.entry(lower).or_insert_with(|| Group {
|
||||
display: tag.name.clone(),
|
||||
@@ -2282,6 +2329,42 @@ mod tests {
|
||||
|
||||
// ── Phase 4: bootstrap heuristic + cosine + DAO support ─────────────
|
||||
|
||||
#[test]
fn is_plausible_name_token_filters_short_and_emoji() {
    // The hard filter runs before grouping: tags under 3 characters and
    // tags carrying emoji/symbols never reach the candidate list, no
    // matter what looks_like_person would say about them afterwards.

    // Tags that must survive the filter.
    let kept = [
        "Cameron",
        "Sarah Smith",
        "O'Brien",
        "Jean-Luc",
        "St. James",
        "Renée",
        "José",
        // Non-Latin scripts pass too — the letter check is script-agnostic.
        "田中太郎",
        // Digits are tolerated at this layer; looks_like_person deals
        // with them on the heuristic side.
        "Trip 2018",
        "2024",
    ];
    for tag in kept {
        assert!(is_plausible_name_token(tag), "expected {tag:?} to be kept");
    }

    // Tags that must be dropped.
    let dropped = [
        // Under the 3-character floor.
        "",
        " ",
        "Bo",
        "AB",
        // Surrounding whitespace is trimmed before the length check.
        " AB ",
        // Any emoji or symbol disqualifies the whole tag.
        "🐱cat",
        "Heart ❤",
        "📸Photo",
        "→ Trip",
        "★Vacation",
    ];
    for tag in dropped {
        assert!(
            !is_plausible_name_token(tag),
            "expected {tag:?} to be dropped"
        );
    }
}
|
||||
|
||||
#[test]
|
||||
fn looks_like_person_accepts_typical_names() {
|
||||
assert!(looks_like_person("Cameron"));
|
||||
|
||||
Reference in New Issue
Block a user