Add Google Takeout data import infrastructure

Implements Phase 1 & 2 of Google Takeout RAG integration:
- Database migrations for calendar_events, location_history, search_history
- DAO implementations with hybrid time + semantic search
- Parsers for .ics, JSON, and HTML Google Takeout formats
- Import utilities with batch insert optimization

Features:
- CalendarEventDao: Hybrid time-range + semantic search for events
- LocationHistoryDao: GPS proximity with Haversine distance calculation
- SearchHistoryDao: Semantic-first search (queries are embedding-rich)
- Batch inserts for performance (1M+ records in minutes vs hours)
- OpenTelemetry tracing for all database operations

Import utilities:
- import_calendar: Parse .ics with optional embedding generation
- import_location_history: High-volume GPS data with batch inserts
- import_search_history: Always generates embeddings for semantic search

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Cameron
2026-01-05 14:50:49 -05:00
parent bb23e6bb25
commit d86b2c3746
27 changed files with 3129 additions and 55 deletions

183
src/parsers/ical_parser.rs Normal file
View File

@@ -0,0 +1,183 @@
use anyhow::{Context, Result};
use chrono::NaiveDateTime;
use ical::parser::ical::component::IcalCalendar;
use ical::property::Property;
use std::fs::File;
use std::io::BufReader;
/// A single calendar event extracted from an `.ics` file.
///
/// Values are produced by [`parse_ics_file`]; only events that carry a
/// summary, a start and an end are ever materialised as this type.
#[derive(Debug, Clone)]
pub struct ParsedCalendarEvent {
    /// Raw `UID` property value, when the event has one.
    pub event_uid: Option<String>,
    /// Raw `SUMMARY` property value.
    pub summary: String,
    /// Raw `DESCRIPTION` property value, when present.
    pub description: Option<String>,
    /// Raw `LOCATION` property value, when present.
    pub location: Option<String>,
    /// Start of the event as a Unix timestamp in seconds.
    pub start_time: i64,
    /// End of the event as a Unix timestamp in seconds.
    pub end_time: i64,
    /// `true` when `DTSTART` was a bare `YYYYMMDD` date.
    pub all_day: bool,
    /// Organizer address with any `mailto:` prefix removed.
    pub organizer: Option<String>,
    /// Attendee addresses with any `mailto:` prefix removed, in file order.
    pub attendees: Vec<String>,
}
/// Reads the `.ics` file at `path` and returns every event it can fully describe.
///
/// Each calendar in the file is walked in order. For every event the
/// `UID`, `SUMMARY`, `DESCRIPTION`, `LOCATION`, `DTSTART`, `DTEND`,
/// `ORGANIZER` and `ATTENDEE` properties are collected; all other
/// properties are ignored. An event is kept only when it has a summary,
/// a start time and an end time — anything else is silently skipped.
///
/// # Errors
/// Fails when the file cannot be opened, when a calendar cannot be parsed,
/// or when a `DTSTART`/`DTEND` value is rejected by `parse_ical_datetime`.
pub fn parse_ics_file(path: &str) -> Result<Vec<ParsedCalendarEvent>> {
    let reader = BufReader::new(File::open(path).context("Failed to open .ics file")?);
    let mut parsed = Vec::new();

    for entry in ical::IcalParser::new(reader) {
        let calendar: IcalCalendar = entry.context("Failed to parse calendar")?;

        for vevent in calendar.events {
            // Per-event accumulators; a repeated property overwrites the previous value.
            let mut uid = None;
            let mut title = None;
            let mut details = None;
            let mut place = None;
            let mut starts_at = None;
            let mut ends_at = None;
            let mut is_all_day = false;
            let mut organizer = None;
            let mut attendees = Vec::new();

            for prop in vevent.properties {
                match prop.name.as_str() {
                    "UID" => uid = prop.value,
                    "SUMMARY" => title = prop.value,
                    "DESCRIPTION" => details = prop.value,
                    "LOCATION" => place = prop.value,
                    "DTSTART" => {
                        if let Some(raw) = prop.value.as_deref() {
                            starts_at = parse_ical_datetime(raw, &prop)?;
                            // A bare YYYYMMDD value has no time component: all-day event.
                            is_all_day = raw.len() == 8;
                        }
                    }
                    "DTEND" => {
                        if let Some(raw) = prop.value.as_deref() {
                            ends_at = parse_ical_datetime(raw, &prop)?;
                        }
                    }
                    "ORGANIZER" => {
                        organizer = extract_email_from_mailto(prop.value.as_deref());
                    }
                    "ATTENDEE" => {
                        // `Option` iterates over zero or one item, so this pushes only when present.
                        attendees.extend(extract_email_from_mailto(prop.value.as_deref()));
                    }
                    _ => {}
                }
            }

            // Events missing a summary, start or end are dropped rather than reported.
            match (title, starts_at, ends_at) {
                (Some(summary), Some(start_time), Some(end_time)) => {
                    parsed.push(ParsedCalendarEvent {
                        event_uid: uid,
                        summary,
                        description: details,
                        location: place,
                        start_time,
                        end_time,
                        all_day: is_all_day,
                        organizer,
                        attendees,
                    });
                }
                _ => {}
            }
        }
    }

    Ok(parsed)
}
fn parse_ical_datetime(value: &str, property: &Property) -> Result<Option<i64>> {
// Check for TZID parameter
let _tzid = property.params.as_ref().and_then(|params| {
params
.iter()
.find(|(key, _)| key == "TZID")
.and_then(|(_, values)| values.first())
.cloned()
});
// iCal datetime formats:
// - 20240815T140000Z (UTC)
// - 20240815T140000 (local/TZID)
// - 20240815 (all-day)
let cleaned = value.replace("Z", "").replace("T", "");
// All-day event (YYYYMMDD)
if cleaned.len() == 8 {
let dt = NaiveDateTime::parse_from_str(&format!("{}000000", cleaned), "%Y%m%d%H%M%S")
.context("Failed to parse all-day date")?;
return Ok(Some(dt.and_utc().timestamp()));
}
// DateTime event (YYYYMMDDTHHMMSS)
if cleaned.len() >= 14 {
let dt = NaiveDateTime::parse_from_str(&cleaned[..14], "%Y%m%d%H%M%S")
.context("Failed to parse datetime")?;
// If original had 'Z', it's UTC
let timestamp = if value.ends_with('Z') {
dt.and_utc().timestamp()
} else {
// Treat as UTC for simplicity (proper TZID handling is complex)
dt.and_utc().timestamp()
};
return Ok(Some(timestamp));
}
Ok(None)
}
/// Returns the address carried by an `ORGANIZER`/`ATTENDEE` value.
///
/// These properties usually look like `mailto:user@example.com`. A single
/// leading `mailto:` scheme is removed, matched case-insensitively so that
/// `MAILTO:` is handled as well; any other value is returned unchanged.
/// `None` in gives `None` out.
fn extract_email_from_mailto(value: Option<&str>) -> Option<String> {
    const SCHEME: &str = "mailto:";

    value.map(|raw| {
        // `get` yields `None` when `raw` is shorter than the scheme or the cut would
        // split a multi-byte character, so the slice below is always on a boundary.
        let address = match raw.get(..SCHEME.len()) {
            Some(prefix) if prefix.eq_ignore_ascii_case(SCHEME) => &raw[SCHEME.len()..],
            _ => raw,
        };
        address.to_string()
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a parameter-less `DTSTART` property carrying `value`.
    fn dtstart(value: &str) -> Property {
        Property {
            name: "DTSTART".to_string(),
            params: None,
            value: Some(value.to_string()),
        }
    }

    #[test]
    fn test_parse_ical_datetime() {
        let prop = dtstart("20240815T140000Z");
        let timestamp = parse_ical_datetime("20240815T140000Z", &prop).unwrap();
        // 2024-08-15T14:00:00Z as seconds since the Unix epoch.
        assert_eq!(timestamp, Some(1_723_730_400));
    }

    #[test]
    fn test_parse_ical_all_day_date() {
        let prop = dtstart("20240815");
        let timestamp = parse_ical_datetime("20240815", &prop).unwrap();
        // All-day dates map to midnight UTC of that day.
        assert_eq!(timestamp, Some(1_723_680_000));
    }

    #[test]
    fn test_parse_ical_datetime_unrecognized_length() {
        let prop = dtstart("2024");
        // Neither 8 nor >= 14 characters: reported as "no value", not as an error.
        assert_eq!(parse_ical_datetime("2024", &prop).unwrap(), None);
    }

    #[test]
    fn test_extract_email() {
        assert_eq!(
            extract_email_from_mailto(Some("mailto:user@example.com")),
            Some("user@example.com".to_string())
        );
        assert_eq!(
            extract_email_from_mailto(Some("user@example.com")),
            Some("user@example.com".to_string())
        );
        assert_eq!(extract_email_from_mailto(None), None);
    }
}