-- ImageApi/migrations/2026-04-29-000000_add_faces/up.sql

-- Local face recognition tables.
--
-- `persons` are visual identities (the "who" of a face). The optional
-- `entity_id` bridges to the existing knowledge graph `entities` table —
-- when set, this person is the visual side of an LLM-extracted entity.
-- Don't auto-create entities from persons; the entity table represents
-- LLM-extracted knowledge with its own confidence semantics, and silently
-- filling it from face detections muddies the provenance.
--
-- `face_detections` carries one row per detected face on a content_hash,
-- plus marker rows with `status='no_faces'` or `status='failed'` so the
-- file watcher knows not to re-scan a hash. Keying on `content_hash`
-- (cross-library dedup) rather than `(library_id, rel_path)` means the
-- same JPEG in two libraries is scanned once. The denormalized `rel_path`
-- carries the most-recently-seen path — useful for cluster-thumb URL
-- generation; canonical path lookup goes through image_exif.

-- One row per visual identity. Names are unique ignoring ASCII case (see the
-- table-level UNIQUE below); `entity_id` is the optional link into the
-- knowledge-graph `entities` table.
CREATE TABLE persons (
    id               INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
    name             TEXT    NOT NULL,
    -- Backfilled when the first face binds. No foreign key is declared on
    -- this column in this table definition.
    cover_face_id    INTEGER,
    -- Optional bridge to entities(id); reset to NULL when the entity row is
    -- deleted.
    entity_id        INTEGER
        CONSTRAINT fk_persons_entity REFERENCES entities(id) ON DELETE SET NULL,
    created_from_tag BOOLEAN NOT NULL DEFAULT 0,
    notes            TEXT,
    created_at       BIGINT  NOT NULL,
    updated_at       BIGINT  NOT NULL,
    -- 'Alice' and 'ALICE' collide; the column itself keeps its default
    -- collation, only the uniqueness check is case-insensitive.
    UNIQUE (name COLLATE NOCASE)
);

-- Index over the optional entity bridge column of `persons`.
CREATE INDEX idx_persons_entity
    ON persons (entity_id);

-- One row per detected face on a content_hash, plus marker rows
-- (status 'no_faces' / 'failed') that record a scan which produced no face
-- data. Detected rows carry the full bounding box and an embedding; marker
-- rows carry none of those columns.
CREATE TABLE face_detections (
    id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
    library_id INTEGER NOT NULL,
    content_hash TEXT NOT NULL,             -- canonical key (cross-library dedup)
    rel_path TEXT NOT NULL,                 -- denormalized; most recently seen
    bbox_x REAL,                            -- normalized 0..1; NULL on marker rows
    bbox_y REAL,                            -- normalized 0..1; NULL on marker rows
    bbox_w REAL,                            -- normalized 0..1; NULL on marker rows
    bbox_h REAL,                            -- normalized 0..1; NULL on marker rows
    embedding BLOB,                         -- 512×f32 = 2048 bytes; NULL on marker rows
    confidence REAL,                        -- detector score
    source TEXT NOT NULL,                   -- 'auto' | 'manual'
    person_id INTEGER,                      -- bound person, if any; NULL when unassigned
    status TEXT NOT NULL DEFAULT 'detected', -- 'detected' | 'no_faces' | 'failed'
    model_version TEXT NOT NULL,            -- e.g. 'buffalo_l'; embedding lineage
    created_at BIGINT NOT NULL,
    -- No ON DELETE action is declared here, so SQLite's default (NO ACTION)
    -- applies when foreign-key enforcement is on.
    CONSTRAINT fk_fd_library FOREIGN KEY (library_id) REFERENCES libraries(id),
    -- Deleting a person unbinds its faces rather than deleting them.
    CONSTRAINT fk_fd_person FOREIGN KEY (person_id) REFERENCES persons(id) ON DELETE SET NULL,
    -- Detected rows carry geometry + embedding; marker rows ('no_faces',
    -- 'failed') carry neither. The CHECK covers all four bbox columns, not
    -- just bbox_x, so a row with a partial bounding box is rejected in both
    -- directions. Because one of the two branches must hold, it also limits
    -- `status` to the three listed values.
    CONSTRAINT chk_marker CHECK (
        (status = 'detected'
            AND bbox_x IS NOT NULL
            AND bbox_y IS NOT NULL
            AND bbox_w IS NOT NULL
            AND bbox_h IS NOT NULL
            AND embedding IS NOT NULL)
        OR (status IN ('no_faces','failed')
            AND bbox_x IS NULL
            AND bbox_y IS NULL
            AND bbox_w IS NULL
            AND bbox_h IS NULL
            AND embedding IS NULL)
    )
);

-- Secondary indexes on face_detections, one per indexed column set.

-- By content hash (the table's canonical key).
CREATE INDEX idx_face_detections_hash
    ON face_detections (content_hash);

-- By (library, denormalized path).
CREATE INDEX idx_face_detections_lib_path
    ON face_detections (library_id, rel_path);

-- By bound person.
CREATE INDEX idx_face_detections_person
    ON face_detections (person_id);

-- By row status ('detected' | 'no_faces' | 'failed').
CREATE INDEX idx_face_detections_status
    ON face_detections (status);

-- At most one 'no_faces' marker per content_hash, so the file watcher
-- cannot double-mark a hash it meets again on a later full-scan pass.
-- The index is partial: rows with any other status are not constrained by it.
CREATE UNIQUE INDEX idx_face_detections_no_faces_unique
    ON face_detections (content_hash)
    WHERE status = 'no_faces';