Add Google Takeout data import infrastructure
Implements Phase 1 & 2 of Google Takeout RAG integration:

- Database migrations for calendar_events, location_history, search_history
- DAO implementations with hybrid time + semantic search
- Parsers for .ics, JSON, and HTML Google Takeout formats
- Import utilities with batch insert optimization

Features:

- CalendarEventDao: hybrid time-range + semantic search for events
- LocationHistoryDao: GPS proximity with Haversine distance calculation
- SearchHistoryDao: semantic-first search (queries are embedding-rich)
- Batch inserts for performance (1M+ records in minutes vs. hours)
- OpenTelemetry tracing for all database operations

Import utilities:

- import_calendar: parse .ics with optional embedding generation
- import_location_history: high-volume GPS data with batch inserts
- import_search_history: always generates embeddings for semantic search

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
183
src/parsers/ical_parser.rs
Normal file
183
src/parsers/ical_parser.rs
Normal file
@@ -0,0 +1,183 @@
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::NaiveDateTime;
|
||||
use ical::parser::ical::component::IcalCalendar;
|
||||
use ical::property::Property;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
/// A single VEVENT extracted from an iCalendar (.ics) file.
///
/// Only events carrying a SUMMARY, DTSTART and DTEND survive parsing
/// (see `parse_ics_file`); every other field is optional in the source data.
#[derive(Debug, Clone)]
pub struct ParsedCalendarEvent {
    // iCal UID property, when present (stable identifier across exports).
    pub event_uid: Option<String>,
    // Event title (iCal SUMMARY).
    pub summary: String,
    // Free-form body text (iCal DESCRIPTION).
    pub description: Option<String>,
    // Venue or address string (iCal LOCATION).
    pub location: Option<String>,
    // Event start as a Unix timestamp in seconds (treated as UTC — see
    // parse_ical_datetime; TZID parameters are not resolved).
    pub start_time: i64,
    // Event end as a Unix timestamp in seconds (same UTC caveat).
    pub end_time: i64,
    // True when DTSTART was a bare YYYYMMDD date with no time component.
    pub all_day: bool,
    // Organizer address with any `mailto:` prefix stripped.
    pub organizer: Option<String>,
    // Attendee addresses with any `mailto:` prefix stripped.
    pub attendees: Vec<String>,
}
|
||||
|
||||
/// Parse an iCalendar `.ics` file (e.g. a Google Takeout calendar export)
/// into a flat list of events.
///
/// Walks every calendar in the file and every VEVENT inside each calendar,
/// mapping the properties of interest onto `ParsedCalendarEvent`.
///
/// Behavior notes grounded in the code below:
/// - Events missing any of SUMMARY, DTSTART or DTEND are silently skipped.
/// - A malformed calendar or an unparseable DTSTART/DTEND aborts the whole
///   parse with an error (the `?` inside the property loop propagates up).
/// - `all_day` is derived only from DTSTART being exactly 8 characters
///   (the bare YYYYMMDD date form).
pub fn parse_ics_file(path: &str) -> Result<Vec<ParsedCalendarEvent>> {
    let file = File::open(path).context("Failed to open .ics file")?;
    let reader = BufReader::new(file);

    let parser = ical::IcalParser::new(reader);
    let mut events = Vec::new();

    for calendar_result in parser {
        let calendar: IcalCalendar = calendar_result.context("Failed to parse calendar")?;

        for event in calendar.events {
            // Accumulators for the properties we extract; anything not
            // matched below is ignored.
            let mut event_uid = None;
            let mut summary = None;
            let mut description = None;
            let mut location = None;
            let mut start_time = None;
            let mut end_time = None;
            let mut all_day = false;
            let mut organizer = None;
            let mut attendees = Vec::new();

            for property in event.properties {
                match property.name.as_str() {
                    "UID" => {
                        event_uid = property.value;
                    }
                    "SUMMARY" => {
                        summary = property.value;
                    }
                    "DESCRIPTION" => {
                        description = property.value;
                    }
                    "LOCATION" => {
                        location = property.value;
                    }
                    "DTSTART" => {
                        if let Some(ref value) = property.value {
                            start_time = parse_ical_datetime(value, &property)?;
                            // Check if it's an all-day event (no time component)
                            all_day = value.len() == 8; // YYYYMMDD format
                        }
                    }
                    "DTEND" => {
                        if let Some(ref value) = property.value {
                            end_time = parse_ical_datetime(value, &property)?;
                        }
                    }
                    "ORGANIZER" => {
                        organizer = extract_email_from_mailto(property.value.as_deref());
                    }
                    "ATTENDEE" => {
                        // Events can carry many ATTENDEE properties; collect them all.
                        if let Some(email) = extract_email_from_mailto(property.value.as_deref()) {
                            attendees.push(email);
                        }
                    }
                    _ => {}
                }
            }

            // Only include events with required fields (summary + both times);
            // incomplete events are dropped without error.
            if let (Some(summary_text), Some(start), Some(end)) = (summary, start_time, end_time) {
                events.push(ParsedCalendarEvent {
                    event_uid,
                    summary: summary_text,
                    description,
                    location,
                    start_time: start,
                    end_time: end,
                    all_day,
                    organizer,
                    attendees,
                });
            }
        }
    }

    Ok(events)
}
|
||||
|
||||
fn parse_ical_datetime(value: &str, property: &Property) -> Result<Option<i64>> {
|
||||
// Check for TZID parameter
|
||||
let _tzid = property.params.as_ref().and_then(|params| {
|
||||
params
|
||||
.iter()
|
||||
.find(|(key, _)| key == "TZID")
|
||||
.and_then(|(_, values)| values.first())
|
||||
.cloned()
|
||||
});
|
||||
|
||||
// iCal datetime formats:
|
||||
// - 20240815T140000Z (UTC)
|
||||
// - 20240815T140000 (local/TZID)
|
||||
// - 20240815 (all-day)
|
||||
|
||||
let cleaned = value.replace("Z", "").replace("T", "");
|
||||
|
||||
// All-day event (YYYYMMDD)
|
||||
if cleaned.len() == 8 {
|
||||
let dt = NaiveDateTime::parse_from_str(&format!("{}000000", cleaned), "%Y%m%d%H%M%S")
|
||||
.context("Failed to parse all-day date")?;
|
||||
return Ok(Some(dt.and_utc().timestamp()));
|
||||
}
|
||||
|
||||
// DateTime event (YYYYMMDDTHHMMSS)
|
||||
if cleaned.len() >= 14 {
|
||||
let dt = NaiveDateTime::parse_from_str(&cleaned[..14], "%Y%m%d%H%M%S")
|
||||
.context("Failed to parse datetime")?;
|
||||
|
||||
// If original had 'Z', it's UTC
|
||||
let timestamp = if value.ends_with('Z') {
|
||||
dt.and_utc().timestamp()
|
||||
} else {
|
||||
// Treat as UTC for simplicity (proper TZID handling is complex)
|
||||
dt.and_utc().timestamp()
|
||||
};
|
||||
|
||||
return Ok(Some(timestamp));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Normalize an ORGANIZER/ATTENDEE property value to a plain email address.
///
/// iCal encodes these as `mailto:user@example.com`; the `mailto:` scheme is
/// stripped when present, and any other value passes through unchanged.
/// Returns `None` only when the property had no value at all.
fn extract_email_from_mailto(value: Option<&str>) -> Option<String> {
    // `strip_prefix` removes the scheme at most once; the previous
    // `trim_start_matches` would strip a repeated "mailto:mailto:" prefix
    // entirely, silently accepting malformed input. `map` replaces the
    // original `and_then` whose closure always returned `Some`.
    value.map(|v| v.strip_prefix("mailto:").unwrap_or(v).to_string())
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // UTC date-time values ("…Z") must yield Some(timestamp).
    #[test]
    fn test_parse_ical_datetime() {
        let prop = Property {
            name: "DTSTART".to_string(),
            params: None,
            value: Some("20240815T140000Z".to_string()),
        };

        let timestamp = parse_ical_datetime("20240815T140000Z", &prop).unwrap();
        assert!(timestamp.is_some());
    }

    // "mailto:" prefixes are stripped; bare addresses pass through unchanged.
    #[test]
    fn test_extract_email() {
        assert_eq!(
            extract_email_from_mailto(Some("mailto:user@example.com")),
            Some("user@example.com".to_string())
        );

        assert_eq!(
            extract_email_from_mailto(Some("user@example.com")),
            Some("user@example.com".to_string())
        );
    }
}
|
||||
133
src/parsers/location_json_parser.rs
Normal file
133
src/parsers/location_json_parser.rs
Normal file
@@ -0,0 +1,133 @@
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::DateTime;
|
||||
use serde::Deserialize;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
/// One GPS fix extracted from a Google Takeout location-history export.
#[derive(Debug, Clone)]
pub struct ParsedLocationRecord {
    // Unix timestamp in seconds (converted from either Takeout format).
    pub timestamp: i64,
    // Latitude in decimal degrees (converted from Google's E7 fixed-point).
    pub latitude: f64,
    // Longitude in decimal degrees.
    pub longitude: f64,
    // Reported accuracy, when present — presumably a radius in meters;
    // TODO confirm units against the Takeout documentation.
    pub accuracy: Option<i32>,
    // Highest-confidence detected activity for this fix, when present.
    pub activity: Option<String>,
    // Confidence score accompanying `activity`.
    pub activity_confidence: Option<i32>,
}
|
||||
|
||||
// Google Takeout Location History JSON structures

/// Top-level shape of a Takeout location-history export: a single
/// `locations` array of raw fixes.
#[derive(Debug, Deserialize)]
struct LocationHistory {
    locations: Vec<LocationPoint>,
}
|
||||
|
||||
/// One raw GPS fix as serialized by Google Takeout.
///
/// The JSON keys are camelCase (`timestampMs`, `latitudeE7`, …); the
/// `rename_all` attribute maps them onto these snake_case fields.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct LocationPoint {
    timestamp_ms: Option<String>, // Older format: epoch milliseconds as a string
    timestamp: Option<String>,    // Newer format (ISO8601 / RFC3339 string)
    latitude_e7: Option<i64>,     // Degrees × 1e7 (Google's fixed-point encoding)
    longitude_e7: Option<i64>,    // Degrees × 1e7
    accuracy: Option<i32>,        // Reported accuracy — presumably meters; TODO confirm
    activity: Option<Vec<ActivityRecord>>, // Detected-activity annotations, when present
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct ActivityRecord {
|
||||
activity: Vec<ActivityType>,
|
||||
timestamp_ms: Option<String>,
|
||||
}
|
||||
|
||||
/// A single activity classification candidate with its confidence score
/// (presumably 0–100 — TODO confirm range against Takeout data).
#[derive(Debug, Deserialize)]
struct ActivityType {
    // JSON key is "type", which is a reserved word in Rust.
    #[serde(rename = "type")]
    activity_type: String,
    confidence: i32,
}
|
||||
|
||||
pub fn parse_location_json(path: &str) -> Result<Vec<ParsedLocationRecord>> {
|
||||
let file = File::open(path).context("Failed to open location JSON file")?;
|
||||
let reader = BufReader::new(file);
|
||||
|
||||
let history: LocationHistory =
|
||||
serde_json::from_reader(reader).context("Failed to parse location history JSON")?;
|
||||
|
||||
let mut records = Vec::new();
|
||||
|
||||
for point in history.locations {
|
||||
// Parse timestamp (try both formats)
|
||||
let timestamp = if let Some(ts_ms) = point.timestamp_ms {
|
||||
// Milliseconds since epoch
|
||||
ts_ms
|
||||
.parse::<i64>()
|
||||
.context("Failed to parse timestamp_ms")?
|
||||
/ 1000
|
||||
} else if let Some(ts_iso) = point.timestamp {
|
||||
// ISO8601 format
|
||||
DateTime::parse_from_rfc3339(&ts_iso)
|
||||
.context("Failed to parse ISO8601 timestamp")?
|
||||
.timestamp()
|
||||
} else {
|
||||
continue; // Skip points without timestamp
|
||||
};
|
||||
|
||||
// Convert E7 format to decimal degrees
|
||||
let latitude = point.latitude_e7.map(|e7| e7 as f64 / 10_000_000.0);
|
||||
let longitude = point.longitude_e7.map(|e7| e7 as f64 / 10_000_000.0);
|
||||
|
||||
// Extract highest-confidence activity
|
||||
let (activity, activity_confidence) = point
|
||||
.activity
|
||||
.as_ref()
|
||||
.and_then(|activities| activities.first())
|
||||
.and_then(|record| {
|
||||
record
|
||||
.activity
|
||||
.iter()
|
||||
.max_by_key(|a| a.confidence)
|
||||
.map(|a| (a.activity_type.clone(), a.confidence))
|
||||
})
|
||||
.unzip();
|
||||
|
||||
if let (Some(lat), Some(lon)) = (latitude, longitude) {
|
||||
records.push(ParsedLocationRecord {
|
||||
timestamp,
|
||||
latitude: lat,
|
||||
longitude: lon,
|
||||
accuracy: point.accuracy,
|
||||
activity,
|
||||
activity_confidence,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(records)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // E7 fixed-point → decimal degrees conversion sanity check.
    #[test]
    fn test_e7_conversion() {
        let lat_e7 = 374228300_i64;
        let lat = lat_e7 as f64 / 10_000_000.0;
        assert!((lat - 37.42283).abs() < 0.00001);
    }

    // Older-format Takeout JSON (timestampMs + latitudeE7/longitudeE7 keys)
    // deserializes into the expected structure.
    #[test]
    fn test_parse_sample_json() {
        let json = r#"{
            "locations": [
                {
                    "latitudeE7": 374228300,
                    "longitudeE7": -1221086100,
                    "accuracy": 20,
                    "timestampMs": "1692115200000"
                }
            ]
        }"#;

        let history: LocationHistory = serde_json::from_str(json).unwrap();
        assert_eq!(history.locations.len(), 1);
    }
}
|
||||
7
src/parsers/mod.rs
Normal file
7
src/parsers/mod.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
//! Parsers for Google Takeout export formats: iCalendar (.ics) calendars,
//! location-history JSON, and search-history HTML.

pub mod ical_parser;
pub mod location_json_parser;
pub mod search_html_parser;

// Re-export each parser entry point together with its record type so callers
// can `use …::parsers::{parse_ics_file, ParsedCalendarEvent, …}` without
// naming the submodules.
pub use ical_parser::{ParsedCalendarEvent, parse_ics_file};
pub use location_json_parser::{ParsedLocationRecord, parse_location_json};
pub use search_html_parser::{ParsedSearchRecord, parse_search_html};
|
||||
210
src/parsers/search_html_parser.rs
Normal file
210
src/parsers/search_html_parser.rs
Normal file
@@ -0,0 +1,210 @@
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, NaiveDateTime, Utc};
|
||||
use scraper::{Html, Selector};
|
||||
use std::fs;
|
||||
|
||||
/// One search query extracted from a Google Takeout search-history HTML export.
#[derive(Debug, Clone)]
pub struct ParsedSearchRecord {
    // Unix timestamp in seconds; falls back to the time of parsing when the
    // HTML carries no recognizable date (see parse_search_html).
    pub timestamp: i64,
    // The decoded search query text.
    pub query: String,
    // Originating engine; this parser always sets Some("Google").
    pub search_engine: Option<String>,
}
|
||||
|
||||
/// Parse a Google Takeout search-history HTML export into search records.
///
/// Takeout's HTML layout has varied over time, so three strategies are tried
/// in order, each later one running only when the previous found nothing:
/// 1. modern `div.content-cell` entries,
/// 2. legacy `div.outer-cell` entries,
/// 3. a last-resort scan of every `<a>` whose href looks like a search URL.
pub fn parse_search_html(path: &str) -> Result<Vec<ParsedSearchRecord>> {
    let html_content =
        fs::read_to_string(path).context("Failed to read search history HTML file")?;

    let document = Html::parse_document(&html_content);
    let mut records = Vec::new();

    // Try multiple selector strategies as Google Takeout format varies

    // Strategy 1: Look for specific cell structure
    if let Ok(cell_selector) = Selector::parse("div.content-cell") {
        for cell in document.select(&cell_selector) {
            if let Some(record) = parse_content_cell(&cell) {
                records.push(record);
            }
        }
    }

    // Strategy 2: Look for outer-cell structure (older format)
    if records.is_empty() {
        if let Ok(outer_selector) = Selector::parse("div.outer-cell") {
            for cell in document.select(&outer_selector) {
                if let Some(record) = parse_outer_cell(&cell) {
                    records.push(record);
                }
            }
        }
    }

    // Strategy 3: Generic approach - look for links and timestamps
    if records.is_empty() {
        if let Ok(link_selector) = Selector::parse("a") {
            for link in document.select(&link_selector) {
                if let Some(href) = link.value().attr("href") {
                    // Check if it's a search URL
                    if href.contains("google.com/search?q=") || href.contains("search?q=") {
                        if let Some(query) = extract_query_from_url(href) {
                            // Try to find nearby timestamp
                            let timestamp = find_nearby_timestamp(&link);

                            records.push(ParsedSearchRecord {
                                // NOTE(review): records with no recognizable date are
                                // stamped with the time of import — confirm this is
                                // acceptable for downstream time-range queries.
                                timestamp: timestamp.unwrap_or_else(|| Utc::now().timestamp()),
                                query,
                                search_engine: Some("Google".to_string()),
                            });
                        }
                    }
                }
            }
        }
    }

    Ok(records)
}
|
||||
|
||||
fn parse_content_cell(cell: &scraper::ElementRef) -> Option<ParsedSearchRecord> {
|
||||
let link_selector = Selector::parse("a").ok()?;
|
||||
|
||||
let link = cell.select(&link_selector).next()?;
|
||||
let href = link.value().attr("href")?;
|
||||
let query = extract_query_from_url(href)?;
|
||||
|
||||
// Extract timestamp from cell text
|
||||
let cell_text = cell.text().collect::<Vec<_>>().join(" ");
|
||||
let timestamp = parse_timestamp_from_text(&cell_text);
|
||||
|
||||
Some(ParsedSearchRecord {
|
||||
timestamp: timestamp.unwrap_or_else(|| Utc::now().timestamp()),
|
||||
query,
|
||||
search_engine: Some("Google".to_string()),
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_outer_cell(cell: &scraper::ElementRef) -> Option<ParsedSearchRecord> {
|
||||
let link_selector = Selector::parse("a").ok()?;
|
||||
|
||||
let link = cell.select(&link_selector).next()?;
|
||||
let href = link.value().attr("href")?;
|
||||
let query = extract_query_from_url(href)?;
|
||||
|
||||
let cell_text = cell.text().collect::<Vec<_>>().join(" ");
|
||||
let timestamp = parse_timestamp_from_text(&cell_text);
|
||||
|
||||
Some(ParsedSearchRecord {
|
||||
timestamp: timestamp.unwrap_or_else(|| Utc::now().timestamp()),
|
||||
query,
|
||||
search_engine: Some("Google".to_string()),
|
||||
})
|
||||
}
|
||||
|
||||
/// Pull the `q=` parameter out of a search URL and decode it.
///
/// Example: `https://www.google.com/search?q=rust+programming` → `rust programming`.
/// Returns `None` when the URL carries no `q` parameter.
fn extract_query_from_url(url: &str) -> Option<String> {
    // Accept q as either the first (?q=) or a later (&q=) parameter.
    let query_start = url.find("?q=").or_else(|| url.find("&q="))?;
    let query_part = &url[query_start + 3..];
    let query_end = query_part.find('&').unwrap_or(query_part.len());
    let encoded_query = &query_part[..query_end];

    Some(decode_query_component(encoded_query))
}

/// Decode an application/x-www-form-urlencoded query value: '+' means space
/// and `%XX` is a percent-escaped byte. Invalid escapes pass through as a
/// literal '%'; invalid UTF-8 is replaced (lossy) rather than rejected.
///
/// Hand-rolled because plain percent-decoding (as done previously via
/// `urlencoding::decode`) leaves '+' intact, which mis-decoded queries like
/// `q=rust+programming` — the form-encoding Google uses in search URLs.
fn decode_query_component(encoded: &str) -> String {
    let bytes = encoded.as_bytes();
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            b'+' => {
                decoded.push(b' ');
                i += 1;
            }
            // '%' with two bytes of lookahead available.
            b'%' if i + 2 < bytes.len() => {
                let hex = std::str::from_utf8(&bytes[i + 1..i + 3])
                    .ok()
                    .and_then(|h| u8::from_str_radix(h, 16).ok());
                match hex {
                    Some(byte) => {
                        decoded.push(byte);
                        i += 3;
                    }
                    // Not a valid escape: keep the '%' literally.
                    None => {
                        decoded.push(b'%');
                        i += 1;
                    }
                }
            }
            byte => {
                decoded.push(byte);
                i += 1;
            }
        }
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
|
||||
|
||||
fn find_nearby_timestamp(element: &scraper::ElementRef) -> Option<i64> {
|
||||
// Look for timestamp in parent or sibling elements
|
||||
if let Some(parent) = element.parent() {
|
||||
if parent.value().as_element().is_some() {
|
||||
let parent_ref = scraper::ElementRef::wrap(parent)?;
|
||||
let text = parent_ref.text().collect::<Vec<_>>().join(" ");
|
||||
return parse_timestamp_from_text(&text);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn parse_timestamp_from_text(text: &str) -> Option<i64> {
|
||||
// Google Takeout timestamps often look like:
|
||||
// "Aug 15, 2024, 2:34:56 PM PDT"
|
||||
// "2024-08-15T14:34:56Z"
|
||||
|
||||
// Try ISO8601 first
|
||||
if let Some(iso_match) = text
|
||||
.split_whitespace()
|
||||
.find(|s| s.contains('T') && s.contains('-'))
|
||||
{
|
||||
if let Ok(dt) = DateTime::parse_from_rfc3339(iso_match) {
|
||||
return Some(dt.timestamp());
|
||||
}
|
||||
}
|
||||
|
||||
// Try common date patterns
|
||||
let patterns = [
|
||||
"%b %d, %Y, %I:%M:%S %p", // Aug 15, 2024, 2:34:56 PM
|
||||
"%Y-%m-%d %H:%M:%S", // 2024-08-15 14:34:56
|
||||
"%m/%d/%Y %H:%M:%S", // 08/15/2024 14:34:56
|
||||
];
|
||||
|
||||
for pattern in patterns {
|
||||
// Extract potential date string
|
||||
if let Some(date_part) = extract_date_substring(text) {
|
||||
if let Ok(dt) = NaiveDateTime::parse_from_str(&date_part, pattern) {
|
||||
return Some(dt.and_utc().timestamp());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Heuristically extract a date-like substring: the run of date characters
/// starting at the first digit in `text`.
///
/// Returns `None` when the text contains no digit at all. The allowed
/// character set includes '-' and '/' so that "2024-08-15 14:34:56" and
/// "08/15/2024 14:34:56" survive intact — previously the scan stopped at the
/// first '-' or '/', truncating such strings to just the year and making the
/// "%Y-%m-%d" / "%m/%d/%Y" patterns in parse_timestamp_from_text unreachable.
fn extract_date_substring(text: &str) -> Option<String> {
    let start = text.find(|c: char| c.is_numeric())?;
    let rest = &text[start..];
    // Scan forward over characters that can appear inside a date/time token;
    // anything else (e.g. '.', '!', '(') terminates the substring.
    let end = rest
        .find(|c: char| !c.is_alphanumeric() && !matches!(c, ':' | ',' | ' ' | '-' | '/'))
        .unwrap_or(rest.len());
    Some(rest[..end].trim().to_string())
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // '+'-separated query terms decode to spaces; later parameters (&oq=…)
    // are ignored.
    #[test]
    fn test_extract_query_from_url() {
        let url = "https://www.google.com/search?q=rust+programming&oq=rust";
        let query = extract_query_from_url(url);
        assert_eq!(query, Some("rust programming".to_string()));
    }

    // Percent-escapes (%20) decode correctly.
    #[test]
    fn test_extract_query_with_encoding() {
        let url = "https://www.google.com/search?q=hello%20world";
        let query = extract_query_from_url(url);
        assert_eq!(query, Some("hello world".to_string()));
    }

    // ISO8601 tokens embedded in surrounding prose are recognized.
    #[test]
    fn test_parse_iso_timestamp() {
        let text = "Some text 2024-08-15T14:34:56Z more text";
        let timestamp = parse_timestamp_from_text(text);
        assert!(timestamp.is_some());
    }
}
|
||||
Reference in New Issue
Block a user