feature/hls-content-hash #95

Merged
cameron merged 10 commits from feature/hls-content-hash into master 2026-05-15 20:09:49 +00:00
3 changed files with 297 additions and 144 deletions
Showing only changes of commit 7c153596fe - Show all commits

View File

@@ -13,92 +13,298 @@ use actix_web::{
use log::{debug, error, info, warn}; use log::{debug, error, info, warn};
use opentelemetry::trace::{Span, Status, Tracer}; use opentelemetry::trace::{Span, Status, Tracer};
use opentelemetry::{KeyValue, global}; use opentelemetry::{KeyValue, global};
use serde::Serialize;
use crate::content_hash;
use crate::data::{ use crate::data::{
Claims, PreviewClipRequest, PreviewStatusItem, PreviewStatusRequest, PreviewStatusResponse, Claims, PreviewClipRequest, PreviewStatusItem, PreviewStatusRequest, PreviewStatusResponse,
ThumbnailRequest, ThumbnailRequest,
}; };
use crate::database::PreviewDao; use crate::database::{ExifDao, PreviewDao};
use crate::files::is_valid_full_path; use crate::files::is_valid_full_path;
use crate::libraries; use crate::libraries;
use crate::otel::{extract_context_from_request, global_tracer}; use crate::otel::{extract_context_from_request, global_tracer};
use crate::state::AppState; use crate::state::AppState;
use crate::video::actors::{GeneratePreviewClipMessage, ProcessMessage, create_playlist}; use crate::video::actors::{GeneratePreviewClipMessage, QueueVideosMessage, VideoToQueue};
use crate::video::hls_paths;
/// Response body for `POST /video/generate`. New clients should consume
/// `playlist_url` (hash-keyed, stable across libraries and renames) and
/// poll for readiness via the URL itself. Legacy clients reading the
/// raw `playlist` string will be served the legacy basename-keyed path
/// for as long as the field exists — that field will be dropped once
/// every shipped client has migrated.
#[derive(Serialize, Debug)]
struct GenerateVideoResponse {
/// Hash-keyed URL to the HLS playlist. Resolves to
/// `$VIDEO_PATH/<shard>/<hash>/playlist.m3u8` server-side. Relative
/// segment refs inside the playlist resolve correctly because the
/// browser appends to this URL's path.
playlist_url: String,
/// blake3 content hash of the source video. Stable per byte content,
/// so duplicate uploads / archive ingests share one set of HLS
/// output.
content_hash: String,
/// `true` iff the playlist file is already on disk. `false` means a
/// transcode was queued; clients should retry the URL after a short
/// delay (or rely on HLS.js's own retry policy).
ready: bool,
/// Legacy basename-keyed playlist *path string*. Returned for older
/// clients that read the response body as a single string under the
/// pre-2026-05 wire format. New clients should ignore this field.
#[serde(rename = "playlist")]
legacy_path: String,
}
#[post("/video/generate")] #[post("/video/generate")]
pub async fn generate_video( pub async fn generate_video(
_claims: Claims, _claims: Claims,
request: HttpRequest, request: HttpRequest,
app_state: Data<AppState>, app_state: Data<AppState>,
exif_dao: Data<std::sync::Mutex<Box<dyn ExifDao>>>,
body: web::Json<ThumbnailRequest>, body: web::Json<ThumbnailRequest>,
) -> impl Responder { ) -> impl Responder {
let tracer = global_tracer(); let tracer = global_tracer();
let context = extract_context_from_request(&request); let context = extract_context_from_request(&request);
let mut span = tracer.start_with_context("generate_video", &context); let mut span = tracer.start_with_context("generate_video", &context);
let filename = PathBuf::from(&body.path); let filename_pb = PathBuf::from(&body.path);
let Some(filename) = filename_pb
.file_name()
.and_then(|n| n.to_str())
.map(str::to_string)
else {
let message = format!("Unable to get file name: {:?}", &body.path);
error!("{}", message);
span.set_status(Status::error(message));
return HttpResponse::BadRequest().finish();
};
if let Some(name) = filename.file_name() { let preferred_library =
let filename = name.to_str().expect("Filename should convert to string"); libraries::resolve_library_param(&app_state, body.library.as_deref())
// KNOWN ISSUE (multi-library): playlist filename is the basename
// alone, so two source files with the same basename — whether in
// different libraries or different subdirs of one library —
// overwrite each other's playlists while ffmpeg runs. The
// hash-keyed `content_hash::hls_dir` is the long-term answer
// (see CLAUDE.md "Multi-library data model"); rewiring the
// actor pipeline to use it is out of scope for this branch.
// The orphan-cleanup job above already walks every library so
// it doesn't false-delete archive playlists.
let playlist = format!("{}/{}.m3u8", app_state.video_path, filename);
let library = libraries::resolve_library_param(&app_state, body.library.as_deref())
.ok() .ok()
.flatten() .flatten()
.unwrap_or_else(|| app_state.primary_library()); .unwrap_or_else(|| app_state.primary_library());
// Try the resolved library first, then fall back to any other library // Try the resolved library first, then fall back to any other library
// that actually contains the file — handles union-mode requests where // that actually contains the file — handles union-mode requests where
// the mobile client passes no library but the file lives in a // the mobile client passes no library but the file lives in a
// non-primary library. // non-primary library. Track which library won so the DB lookup is
let resolved = is_valid_full_path(&library.root_path, &body.path, false) // scoped correctly.
.filter(|p| p.exists()) let resolved = is_valid_full_path(&preferred_library.root_path, &body.path, false)
.or_else(|| { .filter(|p| p.exists())
app_state.libraries.iter().find_map(|lib| { .map(|p| (preferred_library.id, preferred_library.root_path.clone(), p))
if lib.id == library.id { .or_else(|| {
return None; app_state.libraries.iter().find_map(|lib| {
} if lib.id == preferred_library.id {
is_valid_full_path(&lib.root_path, &body.path, false).filter(|p| p.exists()) return None;
}) }
}); is_valid_full_path(&lib.root_path, &body.path, false)
.filter(|p| p.exists())
.map(|p| (lib.id, lib.root_path.clone(), p))
})
});
if let Some(path) = resolved { let Some((resolved_library_id, resolved_root, full_path)) = resolved else {
if let Ok(child) = create_playlist(path.to_str().unwrap(), &playlist).await { span.set_status(Status::error(format!("invalid path {:?}", &body.path)));
span.add_event( return HttpResponse::BadRequest().finish();
"playlist_created".to_string(), };
vec![KeyValue::new("playlist-name", filename.to_string())],
// Build the rel_path used to look up the row.
let full_path_str = full_path.to_string_lossy().to_string();
let rel_path = full_path_str
.strip_prefix(&resolved_root)
.unwrap_or(full_path_str.as_str())
.trim_start_matches(['/', '\\'])
.to_string();
// DB lookup first. Cheap and avoids re-reading the file off disk for
// already-ingested videos.
let hash_from_db: Option<String> = {
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
match dao.get_exif_batch(&context, Some(resolved_library_id), &[rel_path.clone()]) {
Ok(rows) => rows.into_iter().next().and_then(|r| r.content_hash),
Err(e) => {
warn!(
"exif_dao.get_exif_batch failed for {} (lib {}): {:?}",
rel_path, resolved_library_id, e
); );
None
span.set_status(Status::Ok);
app_state.stream_manager.do_send(ProcessMessage(
playlist.clone(),
child,
// opentelemetry::Context::new().with_span(span),
));
} }
} else {
span.set_status(Status::error(format!("invalid path {:?}", &body.path)));
return HttpResponse::BadRequest().finish();
} }
};
HttpResponse::Ok().json(playlist) // Best-effort fallback: compute on-the-fly when the DB row hasn't
// been written or is mid-backfill. Read-only — no library mutation.
let content_hash_str = match hash_from_db {
Some(h) => h,
None => match content_hash::compute(&full_path) {
Ok(id) => id.content_hash,
Err(e) => {
error!(
"Failed to compute content_hash for {}: {}",
full_path.display(),
e
);
span.set_status(Status::error(format!("hash compute failed: {}", e)));
return HttpResponse::InternalServerError().finish();
}
},
};
let video_dir = std::path::Path::new(&app_state.video_path);
let playlist_path = hls_paths::playlist_for_hash(video_dir, &content_hash_str);
let sentinel_path = hls_paths::sentinel_for_hash(video_dir, &content_hash_str);
let ready = playlist_path.exists();
if !ready && !sentinel_path.exists() {
// Kick off generation via the existing actor pipeline. Fire-and-
// forget — the playlist appears at `playlist_path` once ffmpeg
// + rename complete. The client polls the URL.
info!(
"/video/generate: queueing playlist for {} (hash={})",
full_path.display(),
&content_hash_str[..content_hash_str.len().min(16)]
);
app_state.playlist_manager.do_send(QueueVideosMessage {
videos: vec![VideoToQueue {
video_path: full_path.clone(),
content_hash: content_hash_str.clone(),
}],
});
span.add_event(
"playlist_queued",
vec![KeyValue::new("content_hash", content_hash_str.clone())],
);
} else if ready {
span.add_event(
"playlist_already_present",
vec![KeyValue::new("content_hash", content_hash_str.clone())],
);
} else { } else {
let message = format!("Unable to get file name: {:?}", filename); // Sentinel present — past transcode attempt failed. Return the
error!("{}", message); // URL anyway (it'll 404 / 5xx at fetch time) so the client gets
span.set_status(Status::error(message)); // a deterministic answer. Operator must delete the sentinel to
// force a retry.
HttpResponse::BadRequest().finish() warn!(
"/video/generate: unsupported sentinel present for {} (hash={}); not re-queueing",
full_path.display(),
&content_hash_str[..content_hash_str.len().min(16)]
);
} }
let playlist_url = format!(
"/video/hls/{}/{}",
content_hash_str,
hls_paths::PLAYLIST_FILENAME
);
let legacy_path = format!("{}/{}.m3u8", app_state.video_path, filename);
span.set_status(Status::Ok);
HttpResponse::Ok().json(GenerateVideoResponse {
playlist_url,
content_hash: content_hash_str,
ready,
legacy_path,
})
}
/// Serve HLS playlist or segment files under the hash-keyed layout
/// `$VIDEO_PATH/<shard>/<hash>/<file>`. The matched `{file}` must be
/// either `playlist.m3u8` or a `segment_NNN.ts` style segment; any other
/// shape is 400'd to defend against operators stashing other content in
/// the hash dir.
#[get("/video/hls/{hash}/{file}")]
pub async fn stream_hls_file(
request: HttpRequest,
_: Claims,
path: web::Path<(String, String)>,
app_state: Data<AppState>,
) -> impl Responder {
let tracer = global_tracer();
let context = extract_context_from_request(&request);
let mut span = tracer.start_with_context("stream_hls_file", &context);
let (hash, file) = path.into_inner();
if !is_valid_hash(&hash) {
span.set_status(Status::error("invalid hash"));
return HttpResponse::BadRequest().body("invalid hash");
}
if !is_allowed_hls_filename(&file) {
span.set_status(Status::error("invalid file"));
return HttpResponse::BadRequest().body("invalid file");
}
let shard = &hash[..2];
let file_path = PathBuf::from(&app_state.video_path)
.join(shard)
.join(&hash)
.join(&file);
// Path-traversal guard: canonicalize both sides and require the file
// to live under `app_state.video_path`. `is_valid_hash` /
// `is_allowed_hls_filename` already block dangerous strings, but
// belt-and-suspenders here is cheap.
let canonical_base = match std::fs::canonicalize(&app_state.video_path) {
Ok(p) => p,
Err(e) => {
error!("Failed to canonicalize VIDEO_PATH: {:?}", e);
span.set_status(Status::error("VIDEO_PATH not canonicalisable"));
return HttpResponse::InternalServerError().finish();
}
};
let canonical_file = match std::fs::canonicalize(&file_path) {
Ok(p) => p,
Err(_) => {
debug!("HLS file not found: {}", file_path.display());
span.set_status(Status::error("not found"));
return HttpResponse::NotFound().finish();
}
};
if !canonical_file.starts_with(&canonical_base) {
warn!(
"Path traversal attempt: {} resolved outside VIDEO_PATH",
file_path.display()
);
span.set_status(Status::error("traversal"));
return HttpResponse::Forbidden().finish();
}
match NamedFile::open(&canonical_file) {
Ok(f) => {
span.set_status(Status::Ok);
f.into_response(&request)
}
Err(_) => {
span.set_status(Status::error("not found"));
HttpResponse::NotFound().finish()
}
}
}
/// 64 lowercase-or-upper hex chars. Strict so we don't accept arbitrary
/// strings that might canonicalize into trouble.
fn is_valid_hash(s: &str) -> bool {
s.len() == 64 && s.bytes().all(|b| b.is_ascii_hexdigit())
}
/// Allowed file names inside a hash dir. `playlist.m3u8` plus segment
/// files matching the `segment_NNN.ts` template that `PlaylistGenerator`
/// writes via `hls_paths::SEGMENT_TEMPLATE`. Anything else (including
/// `.tmp`, `.unsupported`, dotfiles) returns 400 — these are internal
/// artifacts the client should never request.
fn is_allowed_hls_filename(name: &str) -> bool {
if name == hls_paths::PLAYLIST_FILENAME {
return true;
}
if let Some(rest) = name.strip_prefix("segment_")
&& let Some(num) = rest.strip_suffix(".ts")
&& !num.is_empty()
&& num.bytes().all(|b| b.is_ascii_digit())
{
return true;
}
false
} }
#[get("/video/stream")] #[get("/video/stream")]
@@ -427,6 +633,41 @@ mod tests {
use crate::testhelpers::TestPreviewDao; use crate::testhelpers::TestPreviewDao;
use actix_web::App; use actix_web::App;
#[test]
fn is_valid_hash_requires_64_ascii_hex() {
assert!(is_valid_hash(&"a".repeat(64)));
assert!(is_valid_hash(&"F".repeat(64)));
assert!(is_valid_hash(&format!("ab{}", "0".repeat(62))));
assert!(!is_valid_hash(&"a".repeat(63)));
assert!(!is_valid_hash(&"a".repeat(65)));
// Anything outside the hex alphabet — including '/', '.', '..' —
// is rejected up front so the path-traversal canonicalisation
// never has to defend the boundary alone.
assert!(!is_valid_hash(&format!("/{}", "a".repeat(63))));
assert!(!is_valid_hash(&format!("..{}", "a".repeat(62))));
assert!(!is_valid_hash(&"g".repeat(64)));
}
#[test]
fn is_allowed_hls_filename_accepts_only_playlist_and_segments() {
assert!(is_allowed_hls_filename("playlist.m3u8"));
assert!(is_allowed_hls_filename("segment_000.ts"));
assert!(is_allowed_hls_filename("segment_999.ts"));
assert!(is_allowed_hls_filename("segment_0.ts"));
// Internal artifacts the client should never request.
assert!(!is_allowed_hls_filename("playlist.m3u8.tmp"));
assert!(!is_allowed_hls_filename("playlist.unsupported"));
// Traversal / path components — defence in depth alongside
// the actix path matcher itself.
assert!(!is_allowed_hls_filename(".."));
assert!(!is_allowed_hls_filename("../etc/passwd"));
assert!(!is_allowed_hls_filename("segment_abc.ts"));
assert!(!is_allowed_hls_filename("segment_.ts"));
assert!(!is_allowed_hls_filename(""));
}
fn make_token() -> String { fn make_token() -> String {
let claims = Claims::valid_user("1".to_string()); let claims = Claims::valid_user("1".to_string());
jsonwebtoken::encode( jsonwebtoken::encode(

View File

@@ -266,6 +266,7 @@ fn main() -> std::io::Result<()> {
.service(handlers::image::upload_image) .service(handlers::image::upload_image)
.service(handlers::video::generate_video) .service(handlers::video::generate_video)
.service(handlers::video::stream_video) .service(handlers::video::stream_video)
.service(handlers::video::stream_hls_file)
.service(handlers::video::get_video_preview) .service(handlers::video::get_video_preview)
.service(handlers::video::get_preview_status) .service(handlers::video::get_preview_status)
.service(handlers::video::get_video_part) .service(handlers::video::get_video_part)

View File

@@ -5,12 +5,12 @@ use crate::otel::global_tracer;
use crate::video::ffmpeg::{generate_preview_clip, get_duration_seconds_blocking}; use crate::video::ffmpeg::{generate_preview_clip, get_duration_seconds_blocking};
use crate::video::hls_paths; use crate::video::hls_paths;
use actix::prelude::*; use actix::prelude::*;
use log::{debug, error, info, trace, warn}; use log::{debug, error, info, warn};
use opentelemetry::KeyValue; use opentelemetry::KeyValue;
use opentelemetry::trace::{Span, Status, Tracer}; use opentelemetry::trace::{Span, Status, Tracer};
use std::io::Result; use std::io::Result;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::process::{Child, Command, ExitStatus, Stdio}; use std::process::{Command, Stdio};
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use tokio::sync::Semaphore; use tokio::sync::Semaphore;
// ffmpeg -i test.mp4 -c:v h264 -flags +cgop -g 30 -hls_time 3 out.m3u8 // ffmpeg -i test.mp4 -c:v h264 -flags +cgop -g 30 -hls_time 3 out.m3u8
@@ -22,31 +22,6 @@ impl Actor for StreamActor {
type Context = Context<Self>; type Context = Context<Self>;
} }
pub struct ProcessMessage(pub String, pub Child);
impl Message for ProcessMessage {
type Result = Result<ExitStatus>;
}
impl Handler<ProcessMessage> for StreamActor {
type Result = Result<ExitStatus>;
fn handle(&mut self, msg: ProcessMessage, _ctx: &mut Self::Context) -> Self::Result {
trace!("Message received");
let mut process = msg.1;
let result = process.wait();
debug!(
"Finished waiting for: {:?}. Code: {:?}",
msg.0,
result
.as_ref()
.map_or(-1, |status| status.code().unwrap_or(-1))
);
result
}
}
/// A video paired with its content hash, ready to be queued for HLS /// A video paired with its content hash, ready to be queued for HLS
/// playlist generation. Hash is required because all output paths are /// playlist generation. Hash is required because all output paths are
/// keyed on it; callers that lack a hash (rows mid-backfill) must skip /// keyed on it; callers that lack a hash (rows mid-backfill) must skip
@@ -57,70 +32,6 @@ pub struct VideoToQueue {
pub content_hash: String, pub content_hash: String,
} }
/// Legacy basename-keyed playlist path. Retained for the one-shot startup
/// migration that retires pre-content-hash output; new playlist writes go
/// through [`hls_paths::playlist_for_hash`]. Will be removed once the
/// migration ships and runs to completion in production.
#[allow(dead_code)]
pub fn playlist_file_for(playlist_dir: &str, video_path: &Path) -> PathBuf {
let filename = video_path
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("unknown");
PathBuf::from(format!("{}/{}.m3u8", playlist_dir, filename))
}
/// Legacy basename-keyed sentinel path. Same migration-only contract as
/// [`playlist_file_for`].
#[allow(dead_code)]
pub fn playlist_unsupported_sentinel(playlist_file: &Path) -> PathBuf {
let mut s = playlist_file.as_os_str().to_owned();
s.push(".unsupported");
PathBuf::from(s)
}
pub async fn create_playlist(video_path: &str, playlist_file: &str) -> Result<Child> {
if Path::new(playlist_file).exists() {
debug!("Playlist already exists: {}", playlist_file);
return Err(std::io::Error::from(std::io::ErrorKind::AlreadyExists));
}
let result = Command::new("ffmpeg")
.arg("-i")
.arg(video_path)
.arg("-c:v")
.arg("h264")
.arg("-crf")
.arg("21")
.arg("-preset")
.arg("veryfast")
.arg("-hls_time")
.arg("3")
.arg("-hls_list_size")
.arg("0")
.arg("-hls_playlist_type")
.arg("vod")
.arg("-vf")
.arg("scale='min(1080,iw)':-2,setsar=1:1")
.arg(playlist_file)
.stdout(Stdio::null())
.stderr(Stdio::null())
.spawn();
let start_time = std::time::Instant::now();
loop {
actix::clock::sleep(std::time::Duration::from_secs(1)).await;
if Path::new(playlist_file).exists()
|| std::time::Instant::now() - start_time > std::time::Duration::from_secs(5)
{
break;
}
}
result
}
pub fn generate_video_thumbnail(path: &Path, destination: &Path) -> std::io::Result<()> { pub fn generate_video_thumbnail(path: &Path, destination: &Path) -> std::io::Result<()> {
// Probe duration up front and seek to ~50% — gives a more // Probe duration up front and seek to ~50% — gives a more
// representative frame than a fixed offset (skipping title cards on // representative frame than a fixed offset (skipping title cards on