Face Recognition / People Integration #61
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -1913,7 +1913,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "image-api"
|
name = "image-api"
|
||||||
version = "1.0.0"
|
version = "1.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"actix",
|
"actix",
|
||||||
"actix-cors",
|
"actix-cors",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "image-api"
|
name = "image-api"
|
||||||
version = "1.0.0"
|
version = "1.1.0"
|
||||||
authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
|
authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
|
|
||||||
|
|||||||
87
src/faces.rs
87
src/faces.rs
@@ -1456,6 +1456,47 @@ pub struct BootstrapSkipped {
|
|||||||
pub reason: String,
|
pub reason: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Hard filter for the bootstrap candidate list.
///
/// Returns `true` if the tag could plausibly be a person name; returns
/// `false` to drop it from the candidates entirely (not just leave
/// `looks_like_person = false`).
///
/// Rules — all required:
/// - At least 3 characters after trimming. Two-letter tags ("AB", "OK")
///   are almost always abbreviations or markers, not names.
/// - Every character is a letter (any script), a combining diacritical
///   mark (U+0300..=U+036F), an ASCII digit, non-control whitespace, or
///   one of `'`, `’` (U+2019), `-`, `.`, `_`.
/// - No emoji or symbol-class characters. SQL-side string sort already
///   surfaces those at the top of the tag list; filtering them keeps
///   the candidate UI focused on names rather than chart-junk.
/// - No control characters or null bytes. This includes the control
///   characters that Unicode also classes as whitespace (tab, newline,
///   carriage return, U+0085) when they appear *inside* the tag;
///   leading and trailing ones are removed by the trim.
pub(crate) fn is_plausible_name_token(raw: &str) -> bool {
    let trimmed = raw.trim();
    // Count code points, not bytes, so multi-byte scripts are measured
    // the same way as ASCII.
    if trimmed.chars().count() < 3 {
        return false;
    }
    trimmed.chars().all(|c| {
        // `char::is_whitespace` is true for '\t', '\n', '\r' and U+0085,
        // which are control characters. Reject every control character up
        // front so the whitespace allowance below cannot let them through.
        if c.is_control() {
            return false;
        }
        // Digits don't disqualify here — `looks_like_person` rejects them
        // later, but this function only answers "could this be in the
        // candidate list at all?". A tag like "Sarah2" stays as a candidate
        // (display-flagged not-a-person by `looks_like_person`) so the
        // operator can still spot and confirm it manually if it's an alias.
        //
        // The U+0300..=U+036F range admits combining diacritical marks,
        // which `char::is_alphabetic` does not cover; without it a
        // decomposed (NFD) spelling such as "Jose\u{301}" would be dropped
        // while the precomposed "José" is kept.
        //
        // Anything else (emoji, symbols, math operators, arrows, box
        // drawing) disqualifies the whole tag.
        c.is_alphabetic()
            || c.is_ascii_digit()
            || c.is_whitespace()
            || matches!(
                c,
                '\'' | '-' | '.' | '_' | '\u{2019}' | '\u{0300}'..='\u{036F}'
            )
    })
}
|
||||||
|
|
||||||
/// Conservative "this tag *might* be a person name" heuristic. False
|
/// Conservative "this tag *might* be a person name" heuristic. False
|
||||||
/// negatives are fine — the operator confirms in the UI before any row
|
/// negatives are fine — the operator confirms in the UI before any row
|
||||||
/// is created. False positives are also fine for the same reason; the
|
/// is created. False positives are also fine for the same reason; the
|
||||||
@@ -1574,8 +1615,11 @@ async fn bootstrap_candidates_handler<D: FaceDao>(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Group by lowercase name. Pick the most-frequent capitalization for
|
// Group by lowercase name. Pick the most-frequent capitalization
|
||||||
// the display name (ties broken by first-seen).
|
// for the display name (ties broken by first-seen). Filter out
|
||||||
|
// short tags and tags carrying non-name characters (emojis, symbols)
|
||||||
|
// before grouping — they're noise no operator would tick, so showing
|
||||||
|
// them just makes the candidate list harder to scan.
|
||||||
struct Group {
|
struct Group {
|
||||||
display: String,
|
display: String,
|
||||||
display_freq: i64,
|
display_freq: i64,
|
||||||
@@ -1583,6 +1627,9 @@ async fn bootstrap_candidates_handler<D: FaceDao>(
|
|||||||
}
|
}
|
||||||
let mut groups: HashMap<String, Group> = HashMap::new();
|
let mut groups: HashMap<String, Group> = HashMap::new();
|
||||||
for (count, tag) in tags_with_counts {
|
for (count, tag) in tags_with_counts {
|
||||||
|
if !is_plausible_name_token(&tag.name) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
let lower = tag.name.to_lowercase();
|
let lower = tag.name.to_lowercase();
|
||||||
let g = groups.entry(lower).or_insert_with(|| Group {
|
let g = groups.entry(lower).or_insert_with(|| Group {
|
||||||
display: tag.name.clone(),
|
display: tag.name.clone(),
|
||||||
@@ -2282,6 +2329,42 @@ mod tests {
|
|||||||
|
|
||||||
// ── Phase 4: bootstrap heuristic + cosine + DAO support ─────────────
|
// ── Phase 4: bootstrap heuristic + cosine + DAO support ─────────────
|
||||||
|
|
||||||
|
#[test]
fn is_plausible_name_token_filters_short_and_emoji() {
    // The hard filter runs before grouping: a tag it rejects never reaches
    // the candidate list, whatever looks_like_person would say later.
    let kept = [
        // Ordinary names, including apostrophes, hyphens, periods, accents.
        "Cameron",
        "Sarah Smith",
        "O'Brien",
        "Jean-Luc",
        "St. James",
        "Renée",
        "José",
        // Non-Latin scripts count as letters too.
        "田中太郎",
        // Digits are left for looks_like_person to judge, not this filter.
        "Trip 2018",
        "2024",
    ];
    for tag in kept {
        assert!(
            is_plausible_name_token(tag),
            "expected {tag:?} to stay in the candidate list"
        );
    }

    let dropped = [
        // Under the 3-character minimum.
        "",
        " ",
        "Bo",
        "AB",
        // Length is measured after trimming surrounding whitespace.
        " AB ",
        // Any emoji or symbol disqualifies the whole tag.
        "🐱cat",
        "Heart ❤",
        "📸Photo",
        "→ Trip",
        "★Vacation",
    ];
    for tag in dropped {
        assert!(
            !is_plausible_name_token(tag),
            "expected {tag:?} to be filtered out"
        );
    }
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn looks_like_person_accepts_typical_names() {
|
fn looks_like_person_accepts_typical_names() {
|
||||||
assert!(looks_like_person("Cameron"));
|
assert!(looks_like_person("Cameron"));
|
||||||
|
|||||||
Reference in New Issue
Block a user