use anyhow::{Context, Result}; use chrono::{DateTime, NaiveDateTime, Utc}; use scraper::{Html, Selector}; use std::fs; #[derive(Debug, Clone)] pub struct ParsedSearchRecord { pub timestamp: i64, pub query: String, pub search_engine: Option, } pub fn parse_search_html(path: &str) -> Result> { let html_content = fs::read_to_string(path).context("Failed to read search history HTML file")?; let document = Html::parse_document(&html_content); let mut records = Vec::new(); // Try multiple selector strategies as Google Takeout format varies // Strategy 1: Look for specific cell structure if let Ok(cell_selector) = Selector::parse("div.content-cell") { for cell in document.select(&cell_selector) { if let Some(record) = parse_content_cell(&cell) { records.push(record); } } } // Strategy 2: Look for outer-cell structure (older format) if records.is_empty() && let Ok(outer_selector) = Selector::parse("div.outer-cell") { for cell in document.select(&outer_selector) { if let Some(record) = parse_outer_cell(&cell) { records.push(record); } } } // Strategy 3: Generic approach - look for links and timestamps if records.is_empty() && let Ok(link_selector) = Selector::parse("a") { for link in document.select(&link_selector) { if let Some(href) = link.value().attr("href") { // Check if it's a search URL if (href.contains("google.com/search?q=") || href.contains("search?q=")) && let Some(query) = extract_query_from_url(href) { // Try to find nearby timestamp let timestamp = find_nearby_timestamp(&link); records.push(ParsedSearchRecord { timestamp: timestamp.unwrap_or_else(|| Utc::now().timestamp()), query, search_engine: Some("Google".to_string()), }); } } } } Ok(records) } fn parse_content_cell(cell: &scraper::ElementRef) -> Option { let link_selector = Selector::parse("a").ok()?; let link = cell.select(&link_selector).next()?; let href = link.value().attr("href")?; let query = extract_query_from_url(href)?; // Extract timestamp from cell text let cell_text = cell.text().collect::>().join(" "); let timestamp = parse_timestamp_from_text(&cell_text); Some(ParsedSearchRecord { timestamp: timestamp.unwrap_or_else(|| Utc::now().timestamp()), query, search_engine: Some("Google".to_string()), }) } fn parse_outer_cell(cell: &scraper::ElementRef) -> Option { let link_selector = Selector::parse("a").ok()?; let link = cell.select(&link_selector).next()?; let href = link.value().attr("href")?; let query = extract_query_from_url(href)?; let cell_text = cell.text().collect::>().join(" "); let timestamp = parse_timestamp_from_text(&cell_text); Some(ParsedSearchRecord { timestamp: timestamp.unwrap_or_else(|| Utc::now().timestamp()), query, search_engine: Some("Google".to_string()), }) } fn extract_query_from_url(url: &str) -> Option { // Extract query parameter from URL // Example: https://www.google.com/search?q=rust+programming if let Some(query_start) = url.find("?q=").or_else(|| url.find("&q=")) { let query_part = &url[query_start + 3..]; let query_end = query_part.find('&').unwrap_or(query_part.len()); let encoded_query = &query_part[..query_end]; // URL decode urlencoding::decode(encoded_query) .ok() .map(|s| s.to_string()) } else { None } } fn find_nearby_timestamp(element: &scraper::ElementRef) -> Option { // Look for timestamp in parent or sibling elements if let Some(parent) = element.parent() && parent.value().as_element().is_some() { let parent_ref = scraper::ElementRef::wrap(parent)?; let text = parent_ref.text().collect::>().join(" "); return parse_timestamp_from_text(&text); } None } fn parse_timestamp_from_text(text: &str) -> Option { // Google Takeout timestamps often look like: // "Aug 15, 2024, 2:34:56 PM PDT" // "2024-08-15T14:34:56Z" // Try ISO8601 first if let Some(iso_match) = text .split_whitespace() .find(|s| s.contains('T') && s.contains('-')) && let Ok(dt) = DateTime::parse_from_rfc3339(iso_match) { return Some(dt.timestamp()); } // Try common date patterns let patterns = [ "%b %d, %Y, %I:%M:%S %p", // Aug 15, 2024, 2:34:56 PM "%Y-%m-%d %H:%M:%S", // 2024-08-15 14:34:56 "%m/%d/%Y %H:%M:%S", // 08/15/2024 14:34:56 ]; for pattern in patterns { // Extract potential date string if let Some(date_part) = extract_date_substring(text) && let Ok(dt) = NaiveDateTime::parse_from_str(&date_part, pattern) { return Some(dt.and_utc().timestamp()); } } None } fn extract_date_substring(text: &str) -> Option { // Try to extract date-like substring from text // This is a heuristic approach for varied formats // Look for patterns like "Aug 15, 2024, 2:34:56 PM" if let Some(pos) = text.find(|c: char| c.is_numeric()) { let rest = &text[pos..]; if let Some(end) = rest.find(|c: char| !c.is_alphanumeric() && c != ':' && c != ',' && c != ' ') { Some(rest[..end].trim().to_string()) } else { Some(rest.trim().to_string()) } } else { None } } #[cfg(test)] mod tests { use super::*; #[test] fn test_extract_query_from_url() { let url = "https://www.google.com/search?q=rust+programming&oq=rust"; let query = extract_query_from_url(url); assert_eq!(query, Some("rust+programming".to_string())); } #[test] fn test_extract_query_with_encoding() { let url = "https://www.google.com/search?q=hello%20world"; let query = extract_query_from_url(url); assert_eq!(query, Some("hello world".to_string())); } #[test] fn test_parse_iso_timestamp() { let text = "Some text 2024-08-15T14:34:56Z more text"; let timestamp = parse_timestamp_from_text(text); assert!(timestamp.is_some()); } }