/// Hard filter for the bootstrap candidate list.
///
/// Returns `true` if the tag could plausibly be a person name; returns
/// `false` to drop it from the candidates entirely (not just leave
/// `looks_like_person = false`). `bootstrap_candidates_handler` applies this
/// to every tag name before grouping tags by lowercase name, so a rejected
/// tag never reaches the candidate list.
///
/// Rules — all required:
/// - At least 3 visible characters after trimming. Two-letter tags ("AB",
///   "OK") are almost always abbreviations or markers, not names. Combining
///   diacritics do not count towards the floor, so a decomposed spelling
///   ("e" + U+0301) measures the same as its precomposed form ("é").
/// - No emoji or symbol-class characters. SQL-side string sort already
///   surfaces those at the top of the tag list; filtering them keeps the
///   candidate UI focused on names rather than chart-junk.
/// - No control characters or null bytes. This includes tab, newline and
///   carriage return *inside* the tag: `char::is_whitespace` accepts them,
///   and `trim` only strips them from the ends.
///
/// Digits do not disqualify a tag here. `looks_like_person` rejects them
/// later; this function only answers "could this be in the candidate list
/// at all?". A tag like "Sarah2" stays a candidate (display-flagged
/// not-a-person) so the operator can still spot and confirm it manually if
/// it is an alias.
pub(crate) fn is_plausible_name_token(raw: &str) -> bool {
    // Minimum number of visible (non-combining) characters in a candidate.
    const MIN_VISIBLE_CHARS: usize = 3;

    let trimmed = raw.trim();
    let visible = trimmed
        .chars()
        .filter(|&c| !is_combining_diacritic(c))
        .count();
    if visible < MIN_VISIBLE_CHARS {
        return false;
    }
    trimmed.chars().all(is_name_char)
}

/// True for code points in the Combining Diacritical Marks block
/// (U+0300–U+036F), i.e. the accents that follow a base letter in decomposed
/// (NFD) text. `char::is_alphabetic` is false for these, so they need an
/// explicit allowance.
fn is_combining_diacritic(c: char) -> bool {
    matches!(c, '\u{0300}'..='\u{036F}')
}

/// True if `c` may appear anywhere inside a plausible name tag.
///
/// Letters of any script, non-control whitespace, ASCII digits, combining
/// diacritics, and a small set of name punctuation (apostrophe, typographic
/// apostrophe U+2019, hyphen, period, underscore) are accepted. Anything
/// else (emoji, symbols, math operators, arrows, box drawing, control codes)
/// disqualifies the whole tag.
fn is_name_char(c: char) -> bool {
    // Checked first: several control codes (tab, LF, VT, FF, CR, U+0085)
    // are also classified as whitespace and would otherwise be accepted.
    if c.is_control() {
        return false;
    }
    c.is_alphabetic()
        || c.is_whitespace()
        || c.is_ascii_digit()
        || is_combining_diacritic(c)
        || matches!(c, '\'' | '-' | '.' | '_' | '\u{2019}')
}

// Unit test for the filter; in `src/faces.rs` this lives inside `mod tests`.
#[test]
fn is_plausible_name_token_filters_short_and_emoji() {
    // Hard filter applied before grouping — emojis and tags shorter than
    // 3 chars never make it into the candidate list, regardless of
    // looks_like_person's later assessment.
    assert!(is_plausible_name_token("Cameron"));
    assert!(is_plausible_name_token("Sarah Smith"));
    assert!(is_plausible_name_token("O'Brien"));
    assert!(is_plausible_name_token("Jean-Luc"));
    assert!(is_plausible_name_token("St. James"));
    assert!(is_plausible_name_token("Renée"));
    assert!(is_plausible_name_token("José"));
    // Asian script names — the alphabetic/letter check covers any script,
    // not just Latin.
    assert!(is_plausible_name_token("田中太郎"));

    // Below the 3-character floor.
    assert!(!is_plausible_name_token(""));
    assert!(!is_plausible_name_token("   "));
    assert!(!is_plausible_name_token("Bo"));
    assert!(!is_plausible_name_token("AB"));
    // Trim before counting — surrounding whitespace doesn't count.
    assert!(!is_plausible_name_token("  AB  "));

    // Emoji / symbol classes get the whole tag dropped.
    assert!(!is_plausible_name_token("🐱cat"));
    assert!(!is_plausible_name_token("Heart ❤"));
    assert!(!is_plausible_name_token("📸Photo"));
    assert!(!is_plausible_name_token("→ Trip"));
    assert!(!is_plausible_name_token("★Vacation"));

    // Digits are kept (handled by looks_like_person, not here).
    assert!(is_plausible_name_token("Trip 2018"));
    assert!(is_plausible_name_token("2024"));

    // Control characters are rejected even when they double as whitespace.
    assert!(!is_plausible_name_token("Sarah\nSmith"));
    assert!(!is_plausible_name_token("Sarah\tSmith"));
    assert!(!is_plausible_name_token("Sarah\r\nSmith"));
    assert!(!is_plausible_name_token("Sar\0ah"));
    // ...but control whitespace at the edges is trimmed away first.
    assert!(is_plausible_name_token("Sarah\n"));

    // Decomposed (NFD) accents behave like their precomposed forms.
    assert!(is_plausible_name_token("Rene\u{0301}e"));
    assert!(is_plausible_name_token("Jose\u{0301}"));
    // Combining marks do not count towards the 3-character floor.
    assert!(!is_plausible_name_token("A\u{0301}b"));
}