2026-05-10 15:49:19 +00:00
1 changed files with 92 additions and 415 deletions
--- a/src/faces.rs
+++ b/src/faces.rs
@@ -47,7 +47,7 @@ use std::sync::{Arc, Mutex};
 /// Visual identity. The optional `entity_id` bridges this person to an
 /// LLM-extracted knowledge-graph entity (textual side). Persons are NOT
 /// auto-bridged at creation — only when the user explicitly links them in
-/// the management UI, or when bootstrap finds an exact-name match.
+/// the management UI.
 #[derive(Serialize, Queryable, Clone, Debug)]
 pub struct Person {
    pub id: i32,
@@ -366,6 +366,10 @@ pub struct EmbeddingsQuery {
    pub limit: i64,
    #[serde(default)]
    pub offset: i64,
+    /// Restrict to one person's faces. Used by the similar-unassigned
+    /// suggester to fetch a centroid pool. When set, takes precedence
+    /// over `unassigned` (the more specific filter wins).
+    pub person_id: Option<i32>,
 }

 fn default_unassigned() -> bool {
@@ -429,6 +433,7 @@ pub trait FaceDao: Send + Sync {
        ctx: &opentelemetry::Context,
        library_id: Option<i32>,
        unassigned: bool,
+        person_id: Option<i32>,
        limit: i64,
        offset: i64,
    ) -> anyhow::Result<Vec<(FaceDetectionRow, String)>>;
@@ -863,6 +868,7 @@ impl FaceDao for SqliteFaceDao {
        ctx: &opentelemetry::Context,
        library_id: Option<i32>,
        unassigned: bool,
+        person_id: Option<i32>,
        limit: i64,
        offset: i64,
    ) -> anyhow::Result<Vec<(FaceDetectionRow, String)>> {
@@ -876,7 +882,13 @@ impl FaceDao for SqliteFaceDao {
            if let Some(lib) = library_id {
                query = query.filter(face_detections::library_id.eq(lib));
            }
-            if unassigned {
+            // person_id is the more specific filter — when both it and
+            // `unassigned` are supplied, prefer the explicit person id and
+            // ignore the IS NULL constraint (ANDed with the person filter,
+            // it could never match any row).
+            if let Some(pid) = person_id {
+                query = query.filter(face_detections::person_id.eq(pid));
+            } else if unassigned {
                query = query.filter(face_detections::person_id.is_null());
            }
            let rows = query
@@ -1676,18 +1688,10 @@ where
                .route(web::get().to(list_persons_handler::<D>))
                .route(web::post().to(create_person_handler::<D>)),
        )
-        .service(
-            web::resource("/persons/bootstrap")
-                .route(web::post().to(bootstrap_persons_handler::<D>)),
-        )
        .service(
            web::resource("/persons/ignore-bucket")
                .route(web::post().to(ignore_bucket_handler::<D>)),
        )
-        .service(
-            web::resource("/tags/people-bootstrap-candidates")
-                .route(web::get().to(bootstrap_candidates_handler::<D>)),
-        )
        .service(
            web::resource("/persons/{id}")
                .route(web::get().to(get_person_handler::<D>))
@@ -1702,340 +1706,6 @@ where
        )
 }

-// ── Bootstrap (Phase 4) ─────────────────────────────────────────────────────
-
-#[derive(Serialize, Debug, Clone)]
-pub struct BootstrapCandidate {
-    /// Display name — most-frequent capitalization across the case-insensitive
-    /// group, or simply the first one seen if it's a tie.
-    pub name: String,
-    /// Lowercased name; the stable key for grouping and the auto-bind path.
-    pub normalized_name: String,
-    /// Sum of `tagged_photo` counts across all capitalizations of this name.
-    pub usage_count: i64,
-    /// Heuristic suggestion; the UI defaults this to checked but the user
-    /// confirms before [`bootstrap_persons_handler`] actually creates rows.
-    pub looks_like_person: bool,
-    /// True when a `persons` row already exists for this name (any case).
-    /// The UI hides these — re-running bootstrap is idempotent so it's fine
-    /// either way, but the noise isn't worth showing.
-    pub already_exists: bool,
-}
-
-#[derive(Serialize, Debug)]
-pub struct BootstrapCandidatesResponse {
-    pub candidates: Vec<BootstrapCandidate>,
-}
-
-#[derive(Deserialize, Debug)]
-pub struct BootstrapPersonsReq {
-    pub names: Vec<String>,
-}
-
-#[derive(Serialize, Debug)]
-pub struct BootstrapPersonsResponse {
-    pub created: Vec<Person>,
-    pub skipped: Vec<BootstrapSkipped>,
-}
-
-#[derive(Serialize, Debug)]
-pub struct BootstrapSkipped {
-    pub name: String,
-    pub reason: String,
-}
-
-/// Hard filter for the bootstrap candidate list. Returns true if the tag
-/// could plausibly be a person name; returns false to drop it from the
-/// candidates entirely (not just leave looks_like_person=false).
-///
-/// Rules — all required:
-/// - At least 3 characters after trimming. Two-letter tags ("AB", "OK")
-///   are almost always abbreviations or markers, not names.
-/// - No emoji or symbol-class characters. SQL-side string sort already
-///   surfaces those at the top of the tag list; filtering them keeps
-///   the candidate UI focused on names rather than chart-junk.
-/// - No control characters or null bytes.
-pub(crate) fn is_plausible_name_token(raw: &str) -> bool {
-    let trimmed = raw.trim();
-    if trimmed.chars().count() < 3 {
-        return false;
-    }
-    for c in trimmed.chars() {
-        // Letter / mark / decimal-digit / connector-punctuation /
-        // dash / apostrophe / period / whitespace are all plausible in a
-        // name. Anything else (emoji, symbols, math operators, arrows,
-        // box drawing, control codes) disqualifies the whole tag.
-        if c.is_alphabetic()
-            || c.is_whitespace()
-            || matches!(c, '\'' | '-' | '.' | '_' | '\u{2019}')
-        {
-            continue;
-        }
-        if c.is_ascii_digit() {
-            // Digits don't disqualify here — `looks_like_person` rejects
-            // them later, but `is_plausible_name_token` is just about
-            // "could this be in the candidate list at all?". A tag like
-            // "Sarah2" stays as a candidate (display-flagged not-a-person
-            // by looks_like_person) so the operator can still spot and
-            // confirm it manually if it's an alias.
-            continue;
-        }
-        return false;
-    }
-    true
-}
-
-/// Conservative "this tag *might* be a person name" heuristic. False
-/// negatives are fine — the operator confirms in the UI before any row
-/// is created. False positives are also fine for the same reason; the
-/// goal is just to default sensible candidates to checked.
-///
-/// Rules:
-/// - 1–2 whitespace-separated words
-/// - Each word starts with an uppercase character
-/// - No digits anywhere (rejects "Trip 2018", "2024", etc.)
-/// - Single-word names not on a small denylist of common non-person
-///   tags (cat, christmas, beach, ...). Two-word names skip the
-///   denylist because a real two-word person name is the dominant
-///   case ("Sarah Smith") and false-blocking it is worse than false-
-///   accepting "Sunset Walk".
-pub(crate) fn looks_like_person(raw: &str) -> bool {
-    let trimmed = raw.trim();
-    if trimmed.is_empty() {
-        return false;
-    }
-    let words: Vec<&str> = trimmed.split_whitespace().collect();
-    if !(1..=2).contains(&words.len()) {
-        return false;
-    }
-    for w in &words {
-        let Some(first) = w.chars().next() else {
-            return false;
-        };
-        if !first.is_uppercase() {
-            return false;
-        }
-        if w.chars().any(|c| c.is_ascii_digit()) {
-            return false;
-        }
-    }
-    if words.len() == 1 {
-        const DENY: &[&str] = &[
-            // Pets / animals
-            "cat",
-            "dog",
-            "kitten",
-            "puppy",
-            "bird",
-            "fish",
-            "pet",
-            "pets",
-            // Events / occasions
-            "birthday",
-            "christmas",
-            "halloween",
-            "easter",
-            "thanksgiving",
-            "wedding",
-            "anniversary",
-            "vacation",
-            "holiday",
-            "party",
-            "trip",
-            "graduation",
-            "concert",
-            // Places (generic)
-            "home",
-            "work",
-            "beach",
-            "park",
-            "hotel",
-            "restaurant",
-            "office",
-            "house",
-            "garden",
-            // Subjects / styles
-            "food",
-            "sunset",
-            "sunrise",
-            "landscape",
-            "portrait",
-            "selfie",
-            "nature",
-            "flowers",
-            "flower",
-            "snow",
-            "rain",
-            "sky",
-            // Buckets
-            "untagged",
-            "favorites",
-            "favourites",
-            "misc",
-            "other",
-            "random",
-        ];
-        let lower = trimmed.to_lowercase();
-        if DENY.iter().any(|w| *w == lower) {
-            return false;
-        }
-    }
-    true
-}
-
-async fn bootstrap_candidates_handler<D: FaceDao>(
-    _: Claims,
-    request: HttpRequest,
-    face_dao: web::Data<Mutex<D>>,
-    tag_dao: web::Data<Mutex<crate::tags::SqliteTagDao>>,
-) -> impl Responder {
-    use std::collections::HashMap;
-    let context = extract_context_from_request(&request);
-    let span = global_tracer().start_with_context("faces.bootstrap_candidates", &context);
-    let span_context = opentelemetry::Context::current_with_span(span);
-
-    // All tags + their counts. Path filter unused — bootstrap is library-wide.
-    let tags_with_counts = {
-        let mut td = tag_dao.lock().expect("tag dao lock");
-        match crate::tags::TagDao::get_all_tags(&mut *td, &span_context, None) {
-            Ok(t) => t,
-            Err(e) => return HttpResponse::InternalServerError().body(format!("{:#}", e)),
-        }
-    };
-
-    // Group by lowercase name. Pick the most-frequent capitalization
-    // for the display name (ties broken by first-seen). Filter out
-    // short tags and tags carrying non-name characters (emojis, symbols)
-    // before grouping — they're noise no operator would tick, so showing
-    // them just makes the candidate list harder to scan.
-    struct Group {
-        display: String,
-        display_freq: i64,
-        total_count: i64,
-    }
-    let mut groups: HashMap<String, Group> = HashMap::new();
-    for (count, tag) in tags_with_counts {
-        if !is_plausible_name_token(&tag.name) {
-            continue;
-        }
-        let lower = tag.name.to_lowercase();
-        let g = groups.entry(lower).or_insert_with(|| Group {
-            display: tag.name.clone(),
-            display_freq: 0,
-            total_count: 0,
-        });
-        g.total_count += count;
-        if count > g.display_freq {
-            g.display = tag.name.clone();
-            g.display_freq = count;
-        }
-    }
-
-    // Cross-reference against existing persons (bulk one-query lookup).
-    let lower_names: Vec<String> = groups.keys().cloned().collect();
-    let existing = {
-        let mut fd = face_dao.lock().expect("face dao lock");
-        match fd.find_persons_by_names_ci(&span_context, &lower_names) {
-            Ok(m) => m,
-            Err(e) => return HttpResponse::InternalServerError().body(format!("{:#}", e)),
-        }
-    };
-
-    let mut candidates: Vec<BootstrapCandidate> = groups
-        .into_iter()
-        .map(|(lower, g)| BootstrapCandidate {
-            looks_like_person: looks_like_person(&g.display),
-            already_exists: existing.contains_key(&lower),
-            name: g.display,
-            normalized_name: lower,
-            usage_count: g.total_count,
-        })
-        .collect();
-    // Sort: persons-first heuristic by descending count, then alphabetical.
-    // Persons-likely candidates surface near the top so the user doesn't
-    // scroll past dozens of "vacation"-style tags to find them.
-    candidates.sort_by(|a, b| {
-        b.looks_like_person
-            .cmp(&a.looks_like_person)
-            .then(b.usage_count.cmp(&a.usage_count))
-            .then(a.normalized_name.cmp(&b.normalized_name))
-    });
-
-    HttpResponse::Ok().json(BootstrapCandidatesResponse { candidates })
-}
-
-async fn bootstrap_persons_handler<D: FaceDao>(
-    _: Claims,
-    request: HttpRequest,
-    body: web::Json<BootstrapPersonsReq>,
-    face_dao: web::Data<Mutex<D>>,
-) -> impl Responder {
-    let context = extract_context_from_request(&request);
-    let span = global_tracer().start_with_context("faces.bootstrap_persons", &context);
-    let span_context = opentelemetry::Context::current_with_span(span);
-
-    let mut created: Vec<Person> = Vec::new();
-    let mut skipped: Vec<BootstrapSkipped> = Vec::new();
-
-    let mut dao = face_dao.lock().expect("face dao lock");
-
-    // Pre-fetch the existing-name set so a duplicate request reports
-    // "already exists" (skipped) rather than firing N inserts that all
-    // 409 against the UNIQUE COLLATE NOCASE constraint.
-    let lower_names: Vec<String> = body.names.iter().map(|n| n.to_lowercase()).collect();
-    let existing = match dao.find_persons_by_names_ci(&span_context, &lower_names) {
-        Ok(m) => m,
-        Err(e) => return HttpResponse::InternalServerError().body(format!("{:#}", e)),
-    };
-
-    for name in &body.names {
-        let trimmed = name.trim();
-        if trimmed.is_empty() {
-            skipped.push(BootstrapSkipped {
-                name: name.clone(),
-                reason: "empty name".into(),
-            });
-            continue;
-        }
-        let lower = trimmed.to_lowercase();
-        if existing.contains_key(&lower) {
-            skipped.push(BootstrapSkipped {
-                name: trimmed.to_string(),
-                reason: "person already exists".into(),
-            });
-            continue;
-        }
-        match dao.create_person(
-            &span_context,
-            &CreatePersonReq {
-                name: trimmed.to_string(),
-                notes: None,
-                entity_id: None,
-                is_ignored: false,
-            },
-            /*from_tag*/ true,
-        ) {
-            Ok(p) => created.push(p),
-            Err(e) => {
-                if is_unique_violation(&e) {
-                    // Race with a concurrent create; treat as skipped.
-                    skipped.push(BootstrapSkipped {
-                        name: trimmed.to_string(),
-                        reason: "person already exists".into(),
-                    });
-                } else {
-                    skipped.push(BootstrapSkipped {
-                        name: trimmed.to_string(),
-                        reason: format!("{:#}", e),
-                    });
-                }
-            }
-        }
-    }
-
-    HttpResponse::Ok().json(BootstrapPersonsResponse { created, skipped })
-}
-
 // ── Stats / list ────────────────────────────────────────────────────────────

 #[derive(Deserialize)]
@@ -2132,6 +1802,7 @@ async fn embeddings_handler<D: FaceDao>(
        &span_context,
        query.library,
        query.unassigned,
+        query.person_id,
        limit,
        offset,
    )
@@ -2796,77 +2467,7 @@ mod tests {
        );
    }

-    // ── Phase 4: bootstrap heuristic + cosine + DAO support ─────────────
-
-    #[test]
-    fn is_plausible_name_token_filters_short_and_emoji() {
-        // Hard filter applied before grouping — emojis and tags shorter
-        // than 3 chars never make it into the candidate list, regardless
-        // of looks_like_person's later assessment.
-        assert!(is_plausible_name_token("Cameron"));
-        assert!(is_plausible_name_token("Sarah Smith"));
-        assert!(is_plausible_name_token("O'Brien"));
-        assert!(is_plausible_name_token("Jean-Luc"));
-        assert!(is_plausible_name_token("St. James"));
-        assert!(is_plausible_name_token("Renée"));
-        assert!(is_plausible_name_token("José"));
-        // Asian script names — the alphabetic/letter check covers any
-        // script, not just Latin.
-        assert!(is_plausible_name_token("田中太郎"));
-
-        // Below the 3-character floor.
-        assert!(!is_plausible_name_token(""));
-        assert!(!is_plausible_name_token(" "));
-        assert!(!is_plausible_name_token("Bo"));
-        assert!(!is_plausible_name_token("AB"));
-        // Trim before counting — surrounding whitespace doesn't count.
-        assert!(!is_plausible_name_token("  AB  "));
-
-        // Emoji / symbol classes get the whole tag dropped.
-        assert!(!is_plausible_name_token("🐱cat"));
-        assert!(!is_plausible_name_token("Heart ❤"));
-        assert!(!is_plausible_name_token("📸Photo"));
-        assert!(!is_plausible_name_token("→ Trip"));
-        assert!(!is_plausible_name_token("★Vacation"));
-
-        // Digits are kept (handled by looks_like_person, not here).
-        assert!(is_plausible_name_token("Trip 2018"));
-        assert!(is_plausible_name_token("2024"));
-    }
-
-    #[test]
-    fn looks_like_person_accepts_typical_names() {
-        assert!(looks_like_person("Cameron"));
-        assert!(looks_like_person("Sarah Smith"));
-        assert!(looks_like_person("Mary Jane"));
-        // Non-ASCII title-cased single word still counts.
-        assert!(looks_like_person("Renée"));
-    }
-
-    #[test]
-    fn looks_like_person_rejects_obvious_non_people() {
-        // Digits, lowercase, three-or-more words, denylist hits.
-        assert!(!looks_like_person("2018"));
-        assert!(!looks_like_person("Trip 2018"));
-        assert!(!looks_like_person("trip"));
-        assert!(!looks_like_person("Birthday Party Cake"));
-        assert!(!looks_like_person("cat"));
-        assert!(!looks_like_person("Cat")); // denied even when title-cased
-        assert!(!looks_like_person("Christmas"));
-        assert!(!looks_like_person("home"));
-        assert!(!looks_like_person(""));
-        assert!(!looks_like_person("   "));
-    }
-
-    #[test]
-    fn looks_like_person_two_words_skips_denylist() {
-        // Two-word names get a pass on the single-word denylist —
-        // "Sunset Walk" is much more likely a real album than a person,
-        // but false-accepting is fine because the operator confirms.
-        // What matters is we don't false-reject "Sarah Smith".
-        assert!(looks_like_person("Sunset Walk"));
-        assert!(looks_like_person("Sarah Smith"));
-    }
+    // ── Phase 4: cosine + DAO support ───────────────────────────────────

    #[test]
    fn cosine_similarity_known_vectors() {
@@ -3339,6 +2940,82 @@ mod tests {
        assert_eq!(faces[0].person_id, Some(alice.id));
    }

+    #[test]
+    fn list_embeddings_filters_by_person_id() {
+        // Apollo's similar-unassigned suggester relies on this filter to
+        // pull a single person's embeddings without paging the whole
+        // detected set client-side. When person_id is set it must win
+        // over `unassigned=true` (otherwise the IS NULL constraint would
+        // always return an empty set for an assigned person).
+        let mut dao = fresh_dao();
+        diesel::sql_query(
+            "INSERT OR IGNORE INTO libraries (id, name, root_path, created_at) \
+             VALUES (1, 'main', '/tmp', 0)",
+        )
+        .execute(dao.connection.lock().unwrap().deref_mut())
+        .expect("seed libraries");
+
+        let alice = dao
+            .create_person(
+                &ctx(),
+                &CreatePersonReq {
+                    name: "Alice".into(),
+                    notes: None,
+                    entity_id: None,
+                    is_ignored: false,
+                },
+                false,
+            )
+            .unwrap();
+        let bob = dao
+            .create_person(
+                &ctx(),
+                &CreatePersonReq {
+                    name: "Bob".into(),
+                    notes: None,
+                    entity_id: None,
+                    is_ignored: false,
+                },
+                false,
+            )
+            .unwrap();
+
+        let mk_row = |hash: &str, person: Option<i32>| InsertFaceDetectionInput {
+            library_id: 1,
+            content_hash: hash.into(),
+            rel_path: format!("{hash}.jpg"),
+            bbox: Some((0.1, 0.1, 0.2, 0.2)),
+            embedding: Some(vec![0u8; 2048]),
+            confidence: Some(0.9),
+            source: "auto".into(),
+            person_id: person,
+            status: "detected".into(),
+            model_version: "buffalo_l".into(),
+        };
+        dao.store_detection(&ctx(), mk_row("a1", Some(alice.id))).unwrap();
+        dao.store_detection(&ctx(), mk_row("a2", Some(alice.id))).unwrap();
+        dao.store_detection(&ctx(), mk_row("b1", Some(bob.id))).unwrap();
+        dao.store_detection(&ctx(), mk_row("u1", None)).unwrap();
+
+        // person_id=alice returns only alice's two faces — the person
+        // filter overrides the (default-true) `unassigned` flag, which on
+        // its own would have selected u1 only.
+        let alice_rows = dao
+            .list_embeddings(&ctx(), None, true, Some(alice.id), 100, 0)
+            .unwrap();
+        assert_eq!(alice_rows.len(), 2);
+        assert!(alice_rows
+            .iter()
+            .all(|(r, _)| r.person_id == Some(alice.id)));
+
+        // With no person_id, unassigned=true still returns only u1.
+        let unassigned_rows = dao
+            .list_embeddings(&ctx(), None, true, None, 100, 0)
+            .unwrap();
+        assert_eq!(unassigned_rows.len(), 1);
+        assert_eq!(unassigned_rows[0].0.content_hash, "u1");
+    }
+
    // ── crop_image_to_bbox ──────────────────────────────────────────────
    // Pure helper used by the manual face-create handler. Generate a tiny
    // image in memory, write it to a temp file, then exercise the bbox