From fb078b49063df63fe902c3911c9b6c258c11a493 Mon Sep 17 00:00:00 2001
From: Cameron Cordes
Date: Mon, 11 May 2026 21:42:51 -0400
Subject: [PATCH] knowledge: normalize legacy entity_type values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One-shot migration that re-applies the synonym map from
`normalize_entity_type` over every existing row, so legacy entries
written before that helper landed in upsert_entity stop needing
client-side workarounds.

    person ← person | people | human | individual | contact
    place  ← place | location | venue | site | area | landmark
    event  ← event | occasion | activity | celebration
    thing  ← thing | object | item | product

Unknown types ("friend", "family", etc.) get a lowercase+trim sweep so
at minimum case variants collapse — the curator can merge or rename
them via the curation UI from there.

`UPDATE OR IGNORE` skips rows that would violate UNIQUE(name,
entity_type) after the rewrite (e.g. an existing ("Sarah", "person") +
("Sarah", "Person") pair). The duplicate row is kept so it can be
merged through the normal curation flow rather than silently
disappearing; the final lowercase+trim sweep may still rewrite its type
when that does not collide (e.g. "People" → "people").

Idempotent: every UPDATE is conditional on `entity_type != canonical`,
so rows already in canonical form are never touched, and rows skipped
for a collision are simply skipped again while the collision exists —
re-running the migration is a no-op.

The down migration is intentionally inert — we don't have per-row
history of the original strings and the rewritten values stay
semantically correct.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../down.sql | 6 +++
 .../up.sql | 43 +++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 migrations/2026-05-11-000000_normalize_entity_types/down.sql
 create mode 100644 migrations/2026-05-11-000000_normalize_entity_types/up.sql

diff --git a/migrations/2026-05-11-000000_normalize_entity_types/down.sql b/migrations/2026-05-11-000000_normalize_entity_types/down.sql
new file mode 100644
index 0000000..4c1a2f2
--- /dev/null
+++ b/migrations/2026-05-11-000000_normalize_entity_types/down.sql
@@ -0,0 +1,6 @@
+-- Irreversible: we collapsed multiple raw entity_type strings to
+-- canonical forms and don't have a per-row record of the original.
+-- The down migration is intentionally a no-op (the rewritten values
+-- are still semantically correct), and the up migration is safe to
+-- re-run because every UPDATE is conditional on `!= canonical`.
+SELECT 1;
diff --git a/migrations/2026-05-11-000000_normalize_entity_types/up.sql b/migrations/2026-05-11-000000_normalize_entity_types/up.sql
new file mode 100644
index 0000000..def6ab4
--- /dev/null
+++ b/migrations/2026-05-11-000000_normalize_entity_types/up.sql
@@ -0,0 +1,43 @@
+-- Canonicalize `entities.entity_type` so legacy rows from before
+-- `normalize_entity_type` landed in upsert_entity stop polluting
+-- client-side filters. Mirrors the synonym map in
+-- `src/database/knowledge_dao.rs::normalize_entity_type`:
+--   person ← person | people | human | individual | contact
+--   place  ← place | location | venue | site | area | landmark
+--   event  ← event | occasion | activity | celebration
+--   thing  ← thing | object | item | product
+-- Types outside the synonym set (e.g. "friend", "family") only get a
+-- lowercase+trim pass, so at minimum case variants collapse.
+-- NOTE(review): SQLite TRIM() strips only spaces and LOWER() folds only
+-- ASCII; confirm that matches `normalize_entity_type`.
+-- `UPDATE OR IGNORE` skips rows that would violate UNIQUE(name,
+-- entity_type) after the rewrite, e.g. ("Sarah", "person") +
+-- ("Sarah", "Person"). The skipped row is kept for the curator to
+-- merge via the curation UI; the final sweep may still lowercase+trim
+-- its type when that does not collide ("People" → "people").
+
+UPDATE OR IGNORE entities
+SET entity_type = 'person'
+WHERE LOWER(TRIM(entity_type)) IN ('person', 'people', 'human', 'individual', 'contact')
+  AND entity_type != 'person';
+
+UPDATE OR IGNORE entities
+SET entity_type = 'place'
+WHERE LOWER(TRIM(entity_type)) IN ('place', 'location', 'venue', 'site', 'area', 'landmark')
+  AND entity_type != 'place';
+
+UPDATE OR IGNORE entities
+SET entity_type = 'event'
+WHERE LOWER(TRIM(entity_type)) IN ('event', 'occasion', 'activity', 'celebration')
+  AND entity_type != 'event';
+
+UPDATE OR IGNORE entities
+SET entity_type = 'thing'
+WHERE LOWER(TRIM(entity_type)) IN ('thing', 'object', 'item', 'product')
+  AND entity_type != 'thing';
+
+-- Anything left ("Friend" vs "friend"), including rows skipped above,
+-- gets a lowercase+trim sweep so at least case variants collapse.
+UPDATE OR IGNORE entities
+SET entity_type = LOWER(TRIM(entity_type))
+WHERE entity_type != LOWER(TRIM(entity_type));