210 lines
6.6 KiB
Rust
210 lines
6.6 KiB
Rust
use anyhow::{Context, Result};
|
|
use chrono::{DateTime, NaiveDateTime, Utc};
|
|
use scraper::{Html, Selector};
|
|
use std::fs;
|
|
|
|
/// A single search extracted from a search-history HTML export.
///
/// `PartialEq`/`Eq` are derived so records can be compared directly, e.g. in
/// tests or when de-duplicating parsed results.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedSearchRecord {
    /// Unix timestamp in seconds. The parsers in this file substitute the
    /// current time when no timestamp can be read from the page.
    pub timestamp: i64,
    /// Search query taken from the `q` parameter of the result link.
    pub query: String,
    /// Name of the search engine; the parsers in this file always set
    /// `Some("Google")`.
    pub search_engine: Option<String>,
}
|
|
|
|
pub fn parse_search_html(path: &str) -> Result<Vec<ParsedSearchRecord>> {
|
|
let html_content =
|
|
fs::read_to_string(path).context("Failed to read search history HTML file")?;
|
|
|
|
let document = Html::parse_document(&html_content);
|
|
let mut records = Vec::new();
|
|
|
|
// Try multiple selector strategies as Google Takeout format varies
|
|
|
|
// Strategy 1: Look for specific cell structure
|
|
if let Ok(cell_selector) = Selector::parse("div.content-cell") {
|
|
for cell in document.select(&cell_selector) {
|
|
if let Some(record) = parse_content_cell(&cell) {
|
|
records.push(record);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Strategy 2: Look for outer-cell structure (older format)
|
|
if records.is_empty()
|
|
&& let Ok(outer_selector) = Selector::parse("div.outer-cell")
|
|
{
|
|
for cell in document.select(&outer_selector) {
|
|
if let Some(record) = parse_outer_cell(&cell) {
|
|
records.push(record);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Strategy 3: Generic approach - look for links and timestamps
|
|
if records.is_empty()
|
|
&& let Ok(link_selector) = Selector::parse("a")
|
|
{
|
|
for link in document.select(&link_selector) {
|
|
if let Some(href) = link.value().attr("href") {
|
|
// Check if it's a search URL
|
|
if (href.contains("google.com/search?q=") || href.contains("search?q="))
|
|
&& let Some(query) = extract_query_from_url(href)
|
|
{
|
|
// Try to find nearby timestamp
|
|
let timestamp = find_nearby_timestamp(&link);
|
|
|
|
records.push(ParsedSearchRecord {
|
|
timestamp: timestamp.unwrap_or_else(|| Utc::now().timestamp()),
|
|
query,
|
|
search_engine: Some("Google".to_string()),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(records)
|
|
}
|
|
|
|
fn parse_content_cell(cell: &scraper::ElementRef) -> Option<ParsedSearchRecord> {
|
|
let link_selector = Selector::parse("a").ok()?;
|
|
|
|
let link = cell.select(&link_selector).next()?;
|
|
let href = link.value().attr("href")?;
|
|
let query = extract_query_from_url(href)?;
|
|
|
|
// Extract timestamp from cell text
|
|
let cell_text = cell.text().collect::<Vec<_>>().join(" ");
|
|
let timestamp = parse_timestamp_from_text(&cell_text);
|
|
|
|
Some(ParsedSearchRecord {
|
|
timestamp: timestamp.unwrap_or_else(|| Utc::now().timestamp()),
|
|
query,
|
|
search_engine: Some("Google".to_string()),
|
|
})
|
|
}
|
|
|
|
fn parse_outer_cell(cell: &scraper::ElementRef) -> Option<ParsedSearchRecord> {
|
|
let link_selector = Selector::parse("a").ok()?;
|
|
|
|
let link = cell.select(&link_selector).next()?;
|
|
let href = link.value().attr("href")?;
|
|
let query = extract_query_from_url(href)?;
|
|
|
|
let cell_text = cell.text().collect::<Vec<_>>().join(" ");
|
|
let timestamp = parse_timestamp_from_text(&cell_text);
|
|
|
|
Some(ParsedSearchRecord {
|
|
timestamp: timestamp.unwrap_or_else(|| Utc::now().timestamp()),
|
|
query,
|
|
search_engine: Some("Google".to_string()),
|
|
})
|
|
}
|
|
|
|
fn extract_query_from_url(url: &str) -> Option<String> {
|
|
// Extract query parameter from URL
|
|
// Example: https://www.google.com/search?q=rust+programming
|
|
|
|
if let Some(query_start) = url.find("?q=").or_else(|| url.find("&q=")) {
|
|
let query_part = &url[query_start + 3..];
|
|
let query_end = query_part.find('&').unwrap_or(query_part.len());
|
|
let encoded_query = &query_part[..query_end];
|
|
|
|
// URL decode
|
|
urlencoding::decode(encoded_query)
|
|
.ok()
|
|
.map(|s| s.to_string())
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
fn find_nearby_timestamp(element: &scraper::ElementRef) -> Option<i64> {
|
|
// Look for timestamp in parent or sibling elements
|
|
if let Some(parent) = element.parent()
|
|
&& parent.value().as_element().is_some()
|
|
{
|
|
let parent_ref = scraper::ElementRef::wrap(parent)?;
|
|
let text = parent_ref.text().collect::<Vec<_>>().join(" ");
|
|
return parse_timestamp_from_text(&text);
|
|
}
|
|
None
|
|
}
|
|
|
|
fn parse_timestamp_from_text(text: &str) -> Option<i64> {
|
|
// Google Takeout timestamps often look like:
|
|
// "Aug 15, 2024, 2:34:56 PM PDT"
|
|
// "2024-08-15T14:34:56Z"
|
|
|
|
// Try ISO8601 first
|
|
if let Some(iso_match) = text
|
|
.split_whitespace()
|
|
.find(|s| s.contains('T') && s.contains('-'))
|
|
&& let Ok(dt) = DateTime::parse_from_rfc3339(iso_match)
|
|
{
|
|
return Some(dt.timestamp());
|
|
}
|
|
|
|
// Try common date patterns
|
|
let patterns = [
|
|
"%b %d, %Y, %I:%M:%S %p", // Aug 15, 2024, 2:34:56 PM
|
|
"%Y-%m-%d %H:%M:%S", // 2024-08-15 14:34:56
|
|
"%m/%d/%Y %H:%M:%S", // 08/15/2024 14:34:56
|
|
];
|
|
|
|
for pattern in patterns {
|
|
// Extract potential date string
|
|
if let Some(date_part) = extract_date_substring(text)
|
|
&& let Ok(dt) = NaiveDateTime::parse_from_str(&date_part, pattern)
|
|
{
|
|
return Some(dt.and_utc().timestamp());
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Heuristically extracts a date/time-looking substring from free-form text.
///
/// The text is split on whitespace (including non-breaking variants) and scanned
/// for runs of date-like tokens. A run starts at the first token containing an
/// ASCII digit and continues while tokens contain a digit or are a month name;
/// it ends after an `AM`/`PM` marker, or at a token holding a character other
/// than letters, digits, `:`, `,`, `-` or `/`. When the token right before the
/// run is a month name (`Aug`, `August`), it is included so that
/// `Aug 15, 2024, ...` keeps its month. Tokens are re-joined with single spaces.
///
/// The first run containing a `:` (i.e. a time of day) is returned. If no run has
/// one, the first run found is returned. `None` means the text has no ASCII digit.
///
/// Examples:
/// - `"Aug 15, 2024, 2:34:56 PM PDT"` -> `"Aug 15, 2024, 2:34:56 PM"`
/// - `"2024-08-15 14:34:56 more"` -> `"2024-08-15 14:34:56"`
/// - `"top 10 films Aug 15, 2024, 2:34:56 PM"` -> `"Aug 15, 2024, 2:34:56 PM"`
fn extract_date_substring(text: &str) -> Option<String> {
    const MONTHS: [&str; 12] = [
        "january",
        "february",
        "march",
        "april",
        "may",
        "june",
        "july",
        "august",
        "september",
        "october",
        "november",
        "december",
    ];

    let has_digit = |s: &str| s.contains(|c: char| c.is_ascii_digit());
    let is_date_char = |c: char| c.is_alphanumeric() || matches!(c, ':' | ',' | '-' | '/');
    let is_meridiem = |s: &str| s.eq_ignore_ascii_case("am") || s.eq_ignore_ascii_case("pm");
    // Accepts the full month name or its three-letter abbreviation.
    let is_month = |s: &str| {
        let word = s.trim_end_matches(',');
        MONTHS
            .iter()
            .any(|name| word.eq_ignore_ascii_case(name) || word.eq_ignore_ascii_case(&name[..3]))
    };

    let tokens: Vec<&str> = text.split_whitespace().collect();
    let mut fallback: Option<String> = None;
    let mut cursor = 0;

    while cursor < tokens.len() {
        if !has_digit(tokens[cursor]) {
            cursor += 1;
            continue;
        }

        let run_start = cursor;
        let mut parts: Vec<&str> = Vec::new();
        if run_start > 0 && is_month(tokens[run_start - 1]) {
            parts.push(tokens[run_start - 1]);
        }

        while cursor < tokens.len() {
            let raw = tokens[cursor];
            // Drop leading junk such as "(" in front of the run's first digit.
            let token = if cursor == run_start {
                raw.find(|c: char| c.is_ascii_digit())
                    .map_or(raw, |at| &raw[at..])
            } else {
                raw
            };
            // Cut the token at the first character a date cannot contain.
            let cut = token.find(|c: char| !is_date_char(c));
            let piece = cut.map_or(token, |end| &token[..end]);
            let word = piece.trim_end_matches(',');

            if is_meridiem(word) {
                // AM/PM closes a 12-hour time; anything after it (e.g. "PDT") is dropped.
                parts.push(word);
                cursor += 1;
                break;
            }
            if !(has_digit(piece) || is_month(piece)) {
                break;
            }
            parts.push(piece);
            cursor += 1;
            if cut.is_some() {
                break;
            }
        }

        let candidate = parts.join(" ");
        if candidate.contains(':') {
            return Some(candidate);
        }
        if fallback.is_none() {
            fallback = Some(candidate);
        }
    }

    fallback
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// The value stops at `&`; `+` is returned unchanged.
    #[test]
    fn test_extract_query_from_url() {
        let url = "https://www.google.com/search?q=rust+programming&oq=rust";
        assert_eq!(
            extract_query_from_url(url),
            Some("rust+programming".to_string())
        );
    }

    /// Percent-escapes are decoded.
    #[test]
    fn test_extract_query_with_encoding() {
        let url = "https://www.google.com/search?q=hello%20world";
        assert_eq!(extract_query_from_url(url), Some("hello world".to_string()));
    }

    /// URLs without a `q` parameter yield nothing.
    #[test]
    fn test_extract_query_missing_param() {
        assert_eq!(extract_query_from_url("https://www.google.com/maps"), None);
    }

    /// Pins the exact value instead of only checking that something parsed.
    #[test]
    fn test_parse_iso_timestamp() {
        let text = "Some text 2024-08-15T14:34:56Z more text";
        // 2024-08-15T14:34:56Z expressed as Unix seconds.
        assert_eq!(parse_timestamp_from_text(text), Some(1_723_732_496));
    }

    /// Text without any date-like content yields nothing.
    #[test]
    fn test_parse_timestamp_absent() {
        assert_eq!(parse_timestamp_from_text("no date here"), None);
    }
}
|