Add Google Takeout data import infrastructure

Implements Phase 1 & 2 of Google Takeout RAG integration: - Database migrations for calendar_events, location_history, search_history - DAO implementations with hybrid time + semantic search - Parsers for .ics, JSON, and HTML Google Takeout formats - Import utilities with batch insert optimization Features: - CalendarEventDao: Hybrid time-range + semantic search for events - LocationHistoryDao: GPS proximity with Haversine distance calculation - SearchHistoryDao: Semantic-first search (queries are embedding-rich) - Batch inserts for performance (1M+ records in minutes vs hours) - OpenTelemetry tracing for all database operations Import utilities: - import_calendar: Parse .ics with optional embedding generation - import_location_history: High-volume GPS data with batch inserts - import_search_history: Always generates embeddings for semantic search 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-05 14:50:49 -05:00
parent bb23e6bb25
commit d86b2c3746
27 changed files with 3129 additions and 55 deletions
--- a/src/parsers/ical_parser.rs
+++ b/src/parsers/ical_parser.rs
@@ -0,0 +1,183 @@
+use anyhow::{Context, Result};
+use chrono::NaiveDateTime;
+use ical::parser::ical::component::IcalCalendar;
+use ical::property::Property;
+use std::fs::File;
+use std::io::BufReader;
+
+/// A calendar event extracted from one `VEVENT` component of an `.ics` file.
+#[derive(Debug, Clone)]
+pub struct ParsedCalendarEvent {
+    /// Value of the `UID` property, if the event has one.
+    pub event_uid: Option<String>,
+    /// Value of the `SUMMARY` property (events without one are not emitted).
+    pub summary: String,
+    /// Value of the `DESCRIPTION` property, if present.
+    pub description: Option<String>,
+    /// Value of the `LOCATION` property, if present.
+    pub location: Option<String>,
+    /// Start time in seconds since the Unix epoch.
+    pub start_time: i64,
+    /// End time in seconds since the Unix epoch.
+    pub end_time: i64,
+    /// `true` when the `DTSTART` value is a bare 8-character `YYYYMMDD` date.
+    pub all_day: bool,
+    /// Value of the `ORGANIZER` property with a leading `mailto:` removed.
+    pub organizer: Option<String>,
+    /// One entry per `ATTENDEE` property, each with a leading `mailto:` removed.
+    pub attendees: Vec<String>,
+}
+
+/// Parses an iCalendar (`.ics`) file into a flat list of events.
+///
+/// Every event of every calendar in the file is inspected. An event is
+/// emitted only when it has both a `SUMMARY` and a usable `DTSTART`.
+///
+/// When `DTEND` is missing the end time is derived from the start instead of
+/// dropping the event, following RFC 5545 section 3.6.1: one day after the
+/// start for all-day events, equal to the start otherwise. A `DURATION`
+/// property is not consulted.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened, if a calendar fails to
+/// parse, or if `parse_ical_datetime` rejects a `DTSTART`/`DTEND` value.
+pub fn parse_ics_file(path: &str) -> Result<Vec<ParsedCalendarEvent>> {
+    const SECONDS_PER_DAY: i64 = 86_400;
+
+    let file = File::open(path).context("Failed to open .ics file")?;
+    let parser = ical::IcalParser::new(BufReader::new(file));
+    let mut events = Vec::new();
+
+    for calendar_result in parser {
+        let calendar: IcalCalendar = calendar_result.context("Failed to parse calendar")?;
+
+        for event in calendar.events {
+            let mut event_uid = None;
+            let mut summary = None;
+            let mut description = None;
+            let mut location = None;
+            let mut start_time = None;
+            let mut end_time = None;
+            let mut all_day = false;
+            let mut organizer = None;
+            let mut attendees = Vec::new();
+
+            for property in event.properties {
+                match property.name.as_str() {
+                    "UID" => event_uid = property.value,
+                    "SUMMARY" => summary = property.value,
+                    "DESCRIPTION" => description = property.value,
+                    "LOCATION" => location = property.value,
+                    "DTSTART" => {
+                        if let Some(value) = property.value.as_deref() {
+                            start_time = parse_ical_datetime(value, &property)?;
+                            // A bare YYYYMMDD value (no time component) marks an all-day event.
+                            all_day = value.len() == 8;
+                        }
+                    }
+                    "DTEND" => {
+                        if let Some(value) = property.value.as_deref() {
+                            end_time = parse_ical_datetime(value, &property)?;
+                        }
+                    }
+                    "ORGANIZER" => {
+                        organizer = extract_email_from_mailto(property.value.as_deref());
+                    }
+                    "ATTENDEE" => {
+                        attendees.extend(extract_email_from_mailto(property.value.as_deref()));
+                    }
+                    _ => {}
+                }
+            }
+
+            // SUMMARY and DTSTART are required; DTEND is optional in iCalendar.
+            let (Some(summary), Some(start_time)) = (summary, start_time) else {
+                continue;
+            };
+
+            let end_time = end_time.unwrap_or(if all_day {
+                start_time + SECONDS_PER_DAY
+            } else {
+                start_time
+            });
+
+            events.push(ParsedCalendarEvent {
+                event_uid,
+                summary,
+                description,
+                location,
+                start_time,
+                end_time,
+                all_day,
+                organizer,
+                attendees,
+            });
+        }
+    }
+
+    Ok(events)
+}
+
+/// Converts an iCalendar date or date-time value to seconds since the Unix epoch.
+///
+/// Recognised shapes:
+/// - `20240815T140000Z` (UTC)
+/// - `20240815T140000` (local time, possibly qualified by a `TZID` parameter)
+/// - `20240815` (all-day date, interpreted as midnight)
+///
+/// NOTE: the `TZID` parameter of `property` is not applied yet. Values without
+/// a trailing `Z` are interpreted as if they were UTC.
+///
+/// Returns `Ok(None)` when the value has none of the recognised lengths.
+///
+/// # Errors
+///
+/// Returns an error when the value has a recognised length but does not
+/// describe a valid date or date-time.
+fn parse_ical_datetime(value: &str, property: &Property) -> Result<Option<i64>> {
+    // Kept in the signature so TZID support can be added without touching callers.
+    let _ = property;
+
+    // Drop the `T` separator and the `Z` marker so only the digits remain.
+    let digits: String = value.chars().filter(|c| !matches!(c, 'Z' | 'T')).collect();
+
+    // All-day event (YYYYMMDD): pad with a midnight time component.
+    if digits.len() == 8 {
+        let dt = NaiveDateTime::parse_from_str(&format!("{}000000", digits), "%Y%m%d%H%M%S")
+            .context("Failed to parse all-day date")?;
+        return Ok(Some(dt.and_utc().timestamp()));
+    }
+
+    // Date-time event (YYYYMMDDHHMMSS once the separators are removed).
+    if digits.len() >= 14 {
+        // `get` instead of slicing: a malformed value containing multi-byte
+        // characters must produce an error, not a panic.
+        let head = digits.get(..14).context("Failed to parse datetime")?;
+        let dt = NaiveDateTime::parse_from_str(head, "%Y%m%d%H%M%S")
+            .context("Failed to parse datetime")?;
+        return Ok(Some(dt.and_utc().timestamp()));
+    }
+
+    Ok(None)
+}
+
+/// Returns the address part of an `ORGANIZER`/`ATTENDEE` property value.
+///
+/// A single leading `mailto:` scheme is removed. The scheme is matched
+/// case-insensitively because URI schemes are case-insensitive, so `MAILTO:`
+/// is handled as well. Values without the scheme are returned unchanged, and
+/// `None` stays `None`.
+fn extract_email_from_mailto(value: Option<&str>) -> Option<String> {
+    const SCHEME: &str = "mailto:";
+
+    value.map(|raw| {
+        // `get` yields `None` when the value is shorter than the scheme or the
+        // cut would split a multi-byte character, so the slice below is safe.
+        let has_scheme = raw
+            .get(..SCHEME.len())
+            .map_or(false, |prefix| prefix.eq_ignore_ascii_case(SCHEME));
+
+        let address = if has_scheme { &raw[SCHEME.len()..] } else { raw };
+        address.to_string()
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Builds a parameter-less `DTSTART` property carrying `value`.
+    fn dtstart(value: &str) -> Property {
+        Property {
+            name: "DTSTART".to_string(),
+            params: None,
+            value: Some(value.to_string()),
+        }
+    }
+
+    /// A `Z`-suffixed date-time must map to its exact Unix timestamp.
+    #[test]
+    fn test_parse_ical_datetime() {
+        let prop = dtstart("20240815T140000Z");
+
+        let timestamp = parse_ical_datetime("20240815T140000Z", &prop).unwrap();
+        // 2024-08-15T14:00:00Z
+        assert_eq!(timestamp, Some(1_723_730_400));
+    }
+
+    /// A bare date is interpreted as midnight of that day.
+    #[test]
+    fn test_parse_ical_all_day_date() {
+        let prop = dtstart("20240815");
+
+        let timestamp = parse_ical_datetime("20240815", &prop).unwrap();
+        // 2024-08-15T00:00:00Z
+        assert_eq!(timestamp, Some(1_723_680_000));
+    }
+
+    /// Values with an unrecognised length are reported as `None`, not as an error.
+    #[test]
+    fn test_parse_ical_datetime_unrecognised_length() {
+        let prop = dtstart("2024");
+
+        assert_eq!(parse_ical_datetime("2024", &prop).unwrap(), None);
+    }
+
+    #[test]
+    fn test_extract_email() {
+        assert_eq!(
+            extract_email_from_mailto(Some("mailto:user@example.com")),
+            Some("user@example.com".to_string())
+        );
+
+        assert_eq!(
+            extract_email_from_mailto(Some("user@example.com")),
+            Some("user@example.com".to_string())
+        );
+
+        assert_eq!(extract_email_from_mailto(None), None);
+    }
+}
--- a/src/parsers/location_json_parser.rs
+++ b/src/parsers/location_json_parser.rs
@@ -0,0 +1,133 @@
+use anyhow::{Context, Result};
+use chrono::DateTime;
+use serde::Deserialize;
+use std::fs::File;
+use std::io::BufReader;
+
+/// A single location sample produced by `parse_location_json`.
+#[derive(Debug, Clone)]
+pub struct ParsedLocationRecord {
+    /// Sample time in seconds since the Unix epoch.
+    pub timestamp: i64,
+    /// Latitude in decimal degrees (the JSON `latitudeE7` value divided by 1e7).
+    pub latitude: f64,
+    /// Longitude in decimal degrees (the JSON `longitudeE7` value divided by 1e7).
+    pub longitude: f64,
+    /// The JSON `accuracy` value, copied through unchanged.
+    pub accuracy: Option<i32>,
+    /// Type of the highest-confidence candidate in the point's first activity record.
+    pub activity: Option<String>,
+    /// Confidence reported for `activity`.
+    pub activity_confidence: Option<i32>,
+}
+
+// Google Takeout Location History JSON structures
+/// Top-level JSON object: a `locations` array of location points.
+#[derive(Debug, Deserialize)]
+struct LocationHistory {
+    /// All location points, in file order.
+    locations: Vec<LocationPoint>,
+}
+
+/// One entry of the `locations` array.
+///
+/// Field names are mapped to camelCase JSON keys (`timestampMs`,
+/// `latitudeE7`, ...). Every field is optional so that a point missing some
+/// keys still deserializes.
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct LocationPoint {
+    timestamp_ms: Option<String>, // Older format: milliseconds since the epoch, as a string
+    timestamp: Option<String>,    // Newer format (ISO8601)
+    /// Latitude multiplied by 1e7.
+    latitude_e7: Option<i64>,
+    /// Longitude multiplied by 1e7.
+    longitude_e7: Option<i64>,
+    /// Accuracy value as given in the JSON.
+    accuracy: Option<i32>,
+    /// Activity records attached to this point, if any.
+    activity: Option<Vec<ActivityRecord>>,
+}
+
+/// One activity record attached to a location point.
+///
+/// Keys are camelCase like those of the enclosing location point, so the
+/// timestamp is read from `timestampMs`.
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct ActivityRecord {
+    /// Candidate activity types with their confidences.
+    activity: Vec<ActivityType>,
+    /// Time of the record in milliseconds since the epoch, as a string, if present.
+    timestamp_ms: Option<String>,
+}
+
+/// A candidate activity together with its confidence.
+#[derive(Debug, Deserialize)]
+struct ActivityType {
+    /// Activity label, read from the JSON key `type`.
+    #[serde(rename = "type")]
+    activity_type: String,
+    /// Confidence value as given in the JSON.
+    confidence: i32,
+}
+
+/// Reads a location-history JSON file and converts it to location records.
+///
+/// Points without any timestamp, or without both coordinates, are skipped.
+/// The activity reported for a point is the highest-confidence candidate of
+/// its first activity record.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened, the JSON does not match the
+/// expected structure, or a timestamp that is present cannot be parsed.
+pub fn parse_location_json(path: &str) -> Result<Vec<ParsedLocationRecord>> {
+    let reader = BufReader::new(File::open(path).context("Failed to open location JSON file")?);
+    let history: LocationHistory =
+        serde_json::from_reader(reader).context("Failed to parse location history JSON")?;
+
+    let mut records = Vec::new();
+
+    for point in history.locations {
+        // The millisecond form wins when both timestamp forms are present.
+        let timestamp = match (point.timestamp_ms, point.timestamp) {
+            (Some(millis), _) => {
+                millis
+                    .parse::<i64>()
+                    .context("Failed to parse timestamp_ms")?
+                    / 1000
+            }
+            (None, Some(iso)) => DateTime::parse_from_rfc3339(&iso)
+                .context("Failed to parse ISO8601 timestamp")?
+                .timestamp(),
+            (None, None) => continue,
+        };
+
+        let best_candidate = point
+            .activity
+            .as_ref()
+            .and_then(|samples| samples.first())
+            .and_then(|sample| sample.activity.iter().max_by_key(|c| c.confidence));
+        let activity = best_candidate.map(|c| c.activity_type.clone());
+        let activity_confidence = best_candidate.map(|c| c.confidence);
+
+        let (Some(lat_e7), Some(lon_e7)) = (point.latitude_e7, point.longitude_e7) else {
+            continue;
+        };
+
+        records.push(ParsedLocationRecord {
+            timestamp,
+            // E7 integers are degrees scaled by 10^7.
+            latitude: lat_e7 as f64 / 10_000_000.0,
+            longitude: lon_e7 as f64 / 10_000_000.0,
+            accuracy: point.accuracy,
+            activity,
+            activity_confidence,
+        });
+    }
+
+    Ok(records)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_e7_conversion() {
+        let lat_e7 = 374228300_i64;
+        let lat = lat_e7 as f64 / 10_000_000.0;
+        assert!((lat - 37.42283).abs() < 0.00001);
+    }
+
+    /// Checks that the camelCase JSON keys land in the expected fields, not
+    /// merely that the document deserializes.
+    #[test]
+    fn test_parse_sample_json() {
+        let json = r#"{
+            "locations": [
+                {
+                    "latitudeE7": 374228300,
+                    "longitudeE7": -1221086100,
+                    "accuracy": 20,
+                    "timestampMs": "1692115200000"
+                }
+            ]
+        }"#;
+
+        let history: LocationHistory = serde_json::from_str(json).unwrap();
+        assert_eq!(history.locations.len(), 1);
+
+        let point = &history.locations[0];
+        assert_eq!(point.latitude_e7, Some(374_228_300));
+        assert_eq!(point.longitude_e7, Some(-1_221_086_100));
+        assert_eq!(point.accuracy, Some(20));
+        assert_eq!(point.timestamp_ms.as_deref(), Some("1692115200000"));
+        assert!(point.timestamp.is_none());
+        assert!(point.activity.is_none());
+    }
+}
--- a/src/parsers/mod.rs
+++ b/src/parsers/mod.rs
@@ -0,0 +1,7 @@
+pub mod ical_parser;
+pub mod location_json_parser;
+pub mod search_html_parser;
+
+pub use ical_parser::{ParsedCalendarEvent, parse_ics_file};
+pub use location_json_parser::{ParsedLocationRecord, parse_location_json};
+pub use search_html_parser::{ParsedSearchRecord, parse_search_html};
--- a/src/parsers/search_html_parser.rs
+++ b/src/parsers/search_html_parser.rs
@@ -0,0 +1,210 @@
+use anyhow::{Context, Result};
+use chrono::{DateTime, NaiveDateTime, Utc};
+use scraper::{Html, Selector};
+use std::fs;
+
+/// A single search extracted from a search-history HTML export.
+#[derive(Debug, Clone)]
+pub struct ParsedSearchRecord {
+    /// Time of the search in seconds since the Unix epoch. The parser falls
+    /// back to the current time when no timestamp can be read from the HTML.
+    pub timestamp: i64,
+    /// Decoded value of the `q` parameter of the search URL.
+    pub query: String,
+    /// Search engine name; this parser always sets `Some("Google")`.
+    pub search_engine: Option<String>,
+}
+
+/// Extracts search records from a search-history HTML export.
+///
+/// Three strategies are tried in order, each only if the previous ones found
+/// nothing: `div.content-cell` elements, `div.outer-cell` elements, and
+/// finally every link whose target looks like a search URL. Records whose
+/// timestamp cannot be determined are stamped with the current time.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be read.
+pub fn parse_search_html(path: &str) -> Result<Vec<ParsedSearchRecord>> {
+    let html_content =
+        fs::read_to_string(path).context("Failed to read search history HTML file")?;
+    let document = Html::parse_document(&html_content);
+
+    let mut records: Vec<ParsedSearchRecord> = Vec::new();
+
+    // Strategy 1: content cells.
+    if let Ok(content_cells) = Selector::parse("div.content-cell") {
+        records.extend(
+            document
+                .select(&content_cells)
+                .filter_map(|cell| parse_content_cell(&cell)),
+        );
+    }
+
+    // Strategy 2: outer cells (older format).
+    if records.is_empty() {
+        if let Ok(outer_cells) = Selector::parse("div.outer-cell") {
+            records.extend(
+                document
+                    .select(&outer_cells)
+                    .filter_map(|cell| parse_outer_cell(&cell)),
+            );
+        }
+    }
+
+    // Strategy 3: any link that points at a search URL.
+    if records.is_empty() {
+        if let Ok(anchors) = Selector::parse("a") {
+            for anchor in document.select(&anchors) {
+                let Some(href) = anchor.value().attr("href") else {
+                    continue;
+                };
+                // "google.com/search?q=" always contains "search?q=", so this
+                // single test covers both spellings.
+                if !href.contains("search?q=") {
+                    continue;
+                }
+                let Some(query) = extract_query_from_url(href) else {
+                    continue;
+                };
+
+                let timestamp =
+                    find_nearby_timestamp(&anchor).unwrap_or_else(|| Utc::now().timestamp());
+
+                records.push(ParsedSearchRecord {
+                    timestamp,
+                    query,
+                    search_engine: Some("Google".to_string()),
+                });
+            }
+        }
+    }
+
+    Ok(records)
+}
+
+/// Builds a search record from a cell element.
+///
+/// The query comes from the `href` of the first link inside the cell; the
+/// timestamp is searched for in the cell's text and defaults to the current
+/// time. Returns `None` when the cell has no link, the link has no `href`, or
+/// no query can be extracted from it.
+fn parse_content_cell(cell: &scraper::ElementRef) -> Option<ParsedSearchRecord> {
+    let anchors = Selector::parse("a").ok()?;
+
+    let query = cell
+        .select(&anchors)
+        .next()
+        .and_then(|anchor| anchor.value().attr("href"))
+        .and_then(extract_query_from_url)?;
+
+    // Text nodes are joined with spaces so the date tokens stay separated.
+    let text = cell.text().collect::<Vec<_>>().join(" ");
+    let timestamp = parse_timestamp_from_text(&text).unwrap_or_else(|| Utc::now().timestamp());
+
+    Some(ParsedSearchRecord {
+        timestamp,
+        query,
+        search_engine: Some("Google".to_string()),
+    })
+}
+
+/// Builds a search record from an `outer-cell` element (older export layout).
+///
+/// The extraction rules are the same as for a content cell (first link for
+/// the query, cell text for the timestamp), so this delegates to
+/// `parse_content_cell` instead of duplicating its body.
+fn parse_outer_cell(cell: &scraper::ElementRef) -> Option<ParsedSearchRecord> {
+    parse_content_cell(cell)
+}
+
+/// Extracts and decodes the `q` parameter of a search URL.
+///
+/// Example: `https://www.google.com/search?q=rust+programming` yields
+/// `rust programming`.
+///
+/// Returns `None` when the URL has no `q` parameter or the decoded bytes are
+/// not valid UTF-8.
+fn extract_query_from_url(url: &str) -> Option<String> {
+    let query_start = url.find("?q=").or_else(|| url.find("&q="))?;
+    let query_part = &url[query_start + 3..];
+
+    // The value ends at the next parameter separator, if there is one.
+    let encoded_query = query_part.split('&').next().unwrap_or(query_part);
+
+    // Query strings encode a space as `+`, which percent-decoding leaves
+    // untouched. Translate it first; a literal plus arrives as `%2B` and is
+    // therefore unaffected.
+    let form_decoded = encoded_query.replace('+', " ");
+
+    urlencoding::decode(&form_decoded)
+        .ok()
+        .map(|decoded| decoded.into_owned())
+}
+
+/// Looks for a timestamp in the text of the element's parent.
+///
+/// Returns `None` when there is no parent, the parent is not an element, or
+/// its text contains no recognisable timestamp.
+fn find_nearby_timestamp(element: &scraper::ElementRef) -> Option<i64> {
+    let parent = element.parent()?;
+    // `wrap` yields `None` for non-element nodes such as the document root.
+    let parent_ref = scraper::ElementRef::wrap(parent)?;
+
+    let text = parent_ref.text().collect::<Vec<_>>().join(" ");
+    parse_timestamp_from_text(&text)
+}
+
+/// Finds the first timestamp embedded in free text and returns it as seconds
+/// since the Unix epoch.
+///
+/// Recognised shapes:
+/// - `2024-08-15T14:34:56Z` (RFC 3339, offset honoured)
+/// - `Aug 15, 2024, 2:34:56 PM`
+/// - `2024-08-15 14:34:56`
+/// - `08/15/2024 14:34:56`
+///
+/// The text is split on whitespace and every run of consecutive tokens of the
+/// right length is tried against each format, so surrounding words (a query
+/// before the date, a zone abbreviation such as `PDT` after it) do not
+/// prevent a match.
+///
+/// NOTE: the three non-RFC-3339 shapes carry no usable offset and are
+/// interpreted as UTC; a trailing zone abbreviation is ignored.
+fn parse_timestamp_from_text(text: &str) -> Option<i64> {
+    // (format, number of whitespace-separated tokens the format spans)
+    const NAIVE_FORMATS: [(&str, usize); 3] = [
+        ("%b %d, %Y, %I:%M:%S %p", 5), // Aug 15, 2024, 2:34:56 PM
+        ("%Y-%m-%d %H:%M:%S", 2),      // 2024-08-15 14:34:56
+        ("%m/%d/%Y %H:%M:%S", 2),      // 08/15/2024 14:34:56
+    ];
+
+    // `split_whitespace` also splits on non-breaking spaces.
+    let tokens: Vec<&str> = text.split_whitespace().collect();
+
+    // Try RFC 3339 first, on every token that could plausibly be one.
+    let rfc3339 = tokens
+        .iter()
+        .filter(|token| token.contains('T') && token.contains('-'))
+        .find_map(|token| DateTime::parse_from_rfc3339(token).ok());
+    if let Some(dt) = rfc3339 {
+        return Some(dt.timestamp());
+    }
+
+    // Formats are tried in priority order; within a format the earliest
+    // matching position in the text wins.
+    for (format, width) in NAIVE_FORMATS {
+        for window in tokens.windows(width) {
+            let candidate = window.join(" ");
+            if let Ok(dt) = NaiveDateTime::parse_from_str(&candidate, format) {
+                return Some(dt.and_utc().timestamp());
+            }
+        }
+    }
+
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// `+` in the query value stands for a space, and parsing stops at the
+    /// next `&`.
+    #[test]
+    fn test_extract_query_from_url() {
+        let url = "https://www.google.com/search?q=rust+programming&oq=rust";
+        let query = extract_query_from_url(url);
+        assert_eq!(query, Some("rust programming".to_string()));
+    }
+
+    #[test]
+    fn test_extract_query_with_encoding() {
+        let url = "https://www.google.com/search?q=hello%20world";
+        let query = extract_query_from_url(url);
+        assert_eq!(query, Some("hello world".to_string()));
+    }
+
+    /// A URL without a `q` parameter yields no query.
+    #[test]
+    fn test_extract_query_missing_parameter() {
+        assert_eq!(extract_query_from_url("https://www.google.com/maps"), None);
+    }
+
+    /// An RFC 3339 token embedded in other text must map to its exact Unix
+    /// timestamp.
+    #[test]
+    fn test_parse_iso_timestamp() {
+        let text = "Some text 2024-08-15T14:34:56Z more text";
+        let timestamp = parse_timestamp_from_text(text);
+        // 2024-08-15T14:34:56Z
+        assert_eq!(timestamp, Some(1_723_732_496));
+    }
+}