From 736b8aedc0f743ee25c50e1e2b538af1fb2f249e Mon Sep 17 00:00:00 2001 From: Froidefond Julien Date: Sat, 21 Mar 2026 07:05:47 +0100 Subject: [PATCH] feat: add EPUB format support with spine-aware image extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parse EPUB structure (container.xml → OPF → spine → XHTML) to extract images in reading order. Zero new dependencies — reuses zip + regex crates with pre-compiled regexes and per-file index cache for performance. Falls back to CBZ-style image listing when spine contains no images. Includes DB migration, API/indexer/backoffice updates. Co-Authored-By: Claude Opus 4.6 --- apps/api/src/books.rs | 2 +- apps/api/src/pages.rs | 2 + apps/api/src/search.rs | 2 +- apps/backoffice/app/components/BookCard.tsx | 1 + apps/indexer/src/analyzer.rs | 1 + apps/indexer/src/utils.rs | 2 +- crates/parsers/src/lib.rs | 342 ++++++++++++++++++++ infra/migrations/0046_add_epub_format.sql | 10 + 8 files changed, 359 insertions(+), 3 deletions(-) create mode 100644 infra/migrations/0046_add_epub_format.sql diff --git a/apps/api/src/books.rs b/apps/api/src/books.rs index 0d9206c..f2aca16 100644 --- a/apps/api/src/books.rs +++ b/apps/api/src/books.rs @@ -102,7 +102,7 @@ pub struct BookDetails { tag = "books", params( ("library_id" = Option, Query, description = "Filter by library ID"), - ("kind" = Option, Query, description = "Filter by book kind (cbz, cbr, pdf)"), + ("kind" = Option, Query, description = "Filter by book kind (cbz, cbr, pdf, epub)"), ("series" = Option, Query, description = "Filter by series name (use 'unclassified' for books without series)"), ("reading_status" = Option, Query, description = "Filter by reading status, comma-separated (e.g. 'unread,reading')"), ("page" = Option, Query, description = "Page number (1-indexed, default 1)"), diff --git a/apps/api/src/pages.rs b/apps/api/src/pages.rs index 46a76d9..ef7b06b 100644 --- a/apps/api/src/pages.rs +++ b/apps/api/src/pages.rs @@ -351,6 +351,7 @@ async fn prefetch_page(state: AppState, params: &PrefetchParams<'_>) { Some(ref e) if e == "cbz" => "cbz", Some(ref e) if e == "cbr" => "cbr", Some(ref e) if e == "pdf" => "pdf", + Some(ref e) if e == "epub" => "epub", _ => return, } .to_string(); @@ -479,6 +480,7 @@ fn render_page( "cbz" => parsers::BookFormat::Cbz, "cbr" => parsers::BookFormat::Cbr, "pdf" => parsers::BookFormat::Pdf, + "epub" => parsers::BookFormat::Epub, _ => return Err(ApiError::bad_request("unsupported source format")), }; diff --git a/apps/api/src/search.rs b/apps/api/src/search.rs index f3c7298..df2c84f 100644 --- a/apps/api/src/search.rs +++ b/apps/api/src/search.rs @@ -47,7 +47,7 @@ pub struct SearchResponse { params( ("q" = String, Query, description = "Search query (books + series via PostgreSQL full-text)"), ("library_id" = Option, Query, description = "Filter by library ID"), - ("type" = Option, Query, description = "Filter by type (cbz, cbr, pdf)"), + ("type" = Option, Query, description = "Filter by type (cbz, cbr, pdf, epub)"), ("kind" = Option, Query, description = "Filter by kind (alias for type)"), ("limit" = Option, Query, description = "Max results per type (max 100)"), ), diff --git a/apps/backoffice/app/components/BookCard.tsx b/apps/backoffice/app/components/BookCard.tsx index 57c9693..d6b5fbb 100644 --- a/apps/backoffice/app/components/BookCard.tsx +++ b/apps/backoffice/app/components/BookCard.tsx @@ -115,6 +115,7 @@ export function BookCard({ book, readingStatus }: BookCardProps) { ${(book.format ?? book.kind) === 'cbz' ? 'bg-success/10 text-success' : ''} ${(book.format ?? book.kind) === 'cbr' ? 'bg-warning/10 text-warning' : ''} ${(book.format ?? book.kind) === 'pdf' ? 'bg-destructive/10 text-destructive' : ''} + ${(book.format ?? book.kind) === 'epub' ? 'bg-info/10 text-info' : ''} `}> {book.format ?? book.kind} diff --git a/apps/indexer/src/analyzer.rs b/apps/indexer/src/analyzer.rs index af4bdaf..8082d38 100644 --- a/apps/indexer/src/analyzer.rs +++ b/apps/indexer/src/analyzer.rs @@ -290,6 +290,7 @@ fn book_format_from_str(s: &str) -> Option { "cbz" => Some(BookFormat::Cbz), "cbr" => Some(BookFormat::Cbr), "pdf" => Some(BookFormat::Pdf), + "epub" => Some(BookFormat::Epub), _ => None, } } diff --git a/apps/indexer/src/utils.rs b/apps/indexer/src/utils.rs index 47add2e..ffac194 100644 --- a/apps/indexer/src/utils.rs +++ b/apps/indexer/src/utils.rs @@ -40,7 +40,7 @@ pub fn compute_fingerprint(path: &Path, size: u64, mtime: &DateTime) -> Res pub fn kind_from_format(format: BookFormat) -> &'static str { match format { - BookFormat::Pdf => "ebook", + BookFormat::Pdf | BookFormat::Epub => "ebook", BookFormat::Cbz | BookFormat::Cbr => "comic", } } diff --git a/crates/parsers/src/lib.rs b/crates/parsers/src/lib.rs index 6f8083e..7ea18e7 100644 --- a/crates/parsers/src/lib.rs +++ b/crates/parsers/src/lib.rs @@ -9,6 +9,7 @@ pub enum BookFormat { Cbz, Cbr, Pdf, + Epub, } impl BookFormat { @@ -17,6 +18,7 @@ impl BookFormat { Self::Cbz => "cbz", Self::Cbr => "cbr", Self::Pdf => "pdf", + Self::Epub => "epub", } } } @@ -35,6 +37,7 @@ pub fn detect_format(path: &Path) -> Option { "cbz" => Some(BookFormat::Cbz), "cbr" => Some(BookFormat::Cbr), "pdf" => Some(BookFormat::Pdf), + "epub" => Some(BookFormat::Epub), _ => None, } } @@ -144,6 +147,7 @@ pub fn parse_metadata( BookFormat::Cbz => parse_cbz_page_count(path).ok(), BookFormat::Cbr => parse_cbr_page_count(path).ok(), BookFormat::Pdf => parse_pdf_page_count(path).ok(), + BookFormat::Epub => parse_epub_page_count(path).ok(), }; Ok(meta) @@ -156,6 +160,7 @@ pub fn analyze_book(path: &Path, format: BookFormat, pdf_render_scale: u32) -> R BookFormat::Cbz => analyze_cbz(path, true), BookFormat::Cbr => analyze_cbr(path, true), BookFormat::Pdf => analyze_pdf(path, pdf_render_scale), + BookFormat::Epub => analyze_epub(path), } } @@ -530,6 +535,7 @@ pub fn list_archive_images(path: &Path, format: BookFormat) -> Result list_cbz_images(path), BookFormat::Cbr => list_cbr_images(path), BookFormat::Pdf => Err(anyhow::anyhow!("list_archive_images not applicable for PDF")), + BookFormat::Epub => get_epub_image_index(path), } } @@ -629,6 +635,7 @@ pub fn extract_image_by_name(path: &Path, format: BookFormat, image_name: &str) BookFormat::Cbz => extract_cbz_by_name(path, image_name), BookFormat::Cbr => extract_cbr_by_name(path, image_name), BookFormat::Pdf => Err(anyhow::anyhow!("use extract_page for PDF")), + BookFormat::Epub => extract_cbz_by_name(path, image_name), } } @@ -721,6 +728,7 @@ pub fn extract_page(path: &Path, format: BookFormat, page_number: u32, pdf_rende let width = if pdf_render_width == 0 { 1200 } else { pdf_render_width }; render_pdf_page_n(path, page_number, width) } + BookFormat::Epub => extract_epub_page(path, page_number), } } @@ -894,6 +902,340 @@ fn render_pdf_page_n(path: &Path, page_number: u32, width: u32) -> Result>>> = OnceLock::new(); + +fn epub_index_cache() -> &'static Mutex>> { + EPUB_INDEX_CACHE.get_or_init(|| Mutex::new(HashMap::new())) +} + +// Pre-compiled regex patterns for EPUB XML parsing (compiled once on first use) +static RE_EPUB_ROOTFILE: OnceLock = OnceLock::new(); +static RE_EPUB_ITEM: OnceLock = OnceLock::new(); +static RE_EPUB_ITEMREF: OnceLock = OnceLock::new(); +static RE_EPUB_IMG_SRC: OnceLock = OnceLock::new(); +static RE_EPUB_SVG_HREF: OnceLock = OnceLock::new(); +static RE_EPUB_ATTR_ID: OnceLock = OnceLock::new(); +static RE_EPUB_ATTR_HREF: OnceLock = OnceLock::new(); +static RE_EPUB_ATTR_MEDIA: OnceLock = OnceLock::new(); + +struct EpubManifestItem { + href: String, + media_type: String, +} + +/// Build the ordered list of image paths for an EPUB file. +/// Walks the OPF spine to determine reading order, parses XHTML/SVG pages +/// for image references, and falls back to CBZ-style listing if no +/// images are found through the spine. +fn build_epub_image_index(path: &Path) -> Result> { + let file = std::fs::File::open(path) + .with_context(|| format!("cannot open epub: {}", path.display()))?; + let mut archive = zip::ZipArchive::new(file) + .with_context(|| format!("invalid epub zip: {}", path.display()))?; + + // 1. Find OPF path from META-INF/container.xml + let opf_path = { + let mut entry = archive + .by_name("META-INF/container.xml") + .context("missing META-INF/container.xml — not a valid EPUB")?; + let mut buf = Vec::new(); + entry.read_to_end(&mut buf)?; + let xml = String::from_utf8_lossy(&buf); + let re = RE_EPUB_ROOTFILE.get_or_init(|| { + regex::Regex::new(r#"<(?:\w+:)?rootfile[^>]+full-path="([^"]+)""#).unwrap() + }); + re.captures(&xml) + .and_then(|c| c.get(1)) + .map(|m| decode_xml_entities(m.as_str())) + .context("no rootfile found in container.xml")? + }; + + let opf_dir = std::path::Path::new(&opf_path) + .parent() + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(); + + // 2. Parse OPF manifest + spine + let (manifest, spine_idrefs) = { + let mut entry = archive + .by_name(&opf_path) + .with_context(|| format!("missing OPF file: {}", opf_path))?; + let mut buf = Vec::new(); + entry.read_to_end(&mut buf)?; + let xml = String::from_utf8_lossy(&buf); + parse_epub_opf(&xml, &opf_dir)? + }; + + // 3. Walk spine entries to build ordered image list + let re_img = RE_EPUB_IMG_SRC.get_or_init(|| { + regex::Regex::new(r#"(?i)]*src=["']([^"']+)["']"#).unwrap() + }); + let re_svg = RE_EPUB_SVG_HREF.get_or_init(|| { + regex::Regex::new(r#"(?i)]*(?:xlink:)?href=["']([^"']+)["']"#).unwrap() + }); + + let mut images: Vec = Vec::new(); + let mut seen = std::collections::HashSet::new(); + + for idref in &spine_idrefs { + let item = match manifest.get(idref.as_str()) { + Some(item) => item, + None => continue, + }; + + // Direct raster image in spine (rare but possible) + if item.media_type.starts_with("image/") && !item.media_type.contains("svg") { + if seen.insert(item.href.clone()) { + images.push(item.href.clone()); + } + continue; + } + + // Read XHTML/SVG content — entry is dropped at end of match arm, releasing archive borrow + let content = match archive.by_name(&item.href) { + Ok(mut entry) => { + let mut buf = Vec::new(); + match entry.read_to_end(&mut buf) { + Ok(_) => String::from_utf8_lossy(&buf).to_string(), + Err(_) => continue, + } + } + Err(_) => continue, + }; + + let content_dir = std::path::Path::new(&item.href) + .parent() + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(); + + // Extract and + for re in [re_img, re_svg] { + for cap in re.captures_iter(&content) { + if let Some(src) = cap.get(1) { + let src_str = src.as_str(); + if src_str.starts_with("data:") { + continue; + } + let decoded = decode_xml_entities(&percent_decode_epub(src_str)); + let resolved = resolve_epub_path(&content_dir, &decoded); + if seen.insert(resolved.clone()) { + images.push(resolved); + } + } + } + } + } + + // 4. Fallback: no images from spine → list all images in ZIP (CBZ-style) + if images.is_empty() { + for i in 0..archive.len() { + if let Ok(entry) = archive.by_index(i) { + let name = entry.name().to_string(); + if is_image_name(&name.to_ascii_lowercase()) && seen.insert(name.clone()) { + images.push(name); + } + } + } + images.sort_by(|a, b| natord::compare(a, b)); + } + + if images.is_empty() { + return Err(anyhow::anyhow!("no images found in epub: {}", path.display())); + } + + Ok(images) +} + +fn parse_epub_opf( + xml: &str, + opf_dir: &str, +) -> Result<(HashMap, Vec)> { + let re_item = RE_EPUB_ITEM.get_or_init(|| { + regex::Regex::new(r#"(?s)<(?:\w+:)?item\s([^>]+?)/?>"#).unwrap() + }); + let re_itemref = RE_EPUB_ITEMREF.get_or_init(|| { + regex::Regex::new(r#"<(?:\w+:)?itemref\s[^>]*idref="([^"]+)""#).unwrap() + }); + let re_id = RE_EPUB_ATTR_ID.get_or_init(|| { + regex::Regex::new(r#"(?:^|\s)id="([^"]+)""#).unwrap() + }); + let re_href = RE_EPUB_ATTR_HREF.get_or_init(|| { + regex::Regex::new(r#"(?:^|\s)href="([^"]+)""#).unwrap() + }); + let re_media = RE_EPUB_ATTR_MEDIA.get_or_init(|| { + regex::Regex::new(r#"media-type="([^"]+)""#).unwrap() + }); + + let mut manifest: HashMap = HashMap::new(); + for cap in re_item.captures_iter(xml) { + if let Some(attrs) = cap.get(1) { + let a = attrs.as_str(); + let id = re_id.captures(a).and_then(|c| c.get(1)); + let href = re_href.captures(a).and_then(|c| c.get(1)); + let media = re_media.captures(a).and_then(|c| c.get(1)); + + if let (Some(id), Some(href), Some(media)) = (id, href, media) { + let decoded_href = decode_xml_entities(&percent_decode_epub(href.as_str())); + let resolved = resolve_epub_path(opf_dir, &decoded_href); + manifest.insert( + id.as_str().to_string(), + EpubManifestItem { + href: resolved, + media_type: media.as_str().to_string(), + }, + ); + } + } + } + + let spine_idrefs: Vec = re_itemref + .captures_iter(xml) + .filter_map(|c| c.get(1).map(|m| m.as_str().to_string())) + .collect(); + + Ok((manifest, spine_idrefs)) +} + +/// Get the cached image index for an EPUB, building it on first access. +fn get_epub_image_index(path: &Path) -> Result> { + { + let cache = epub_index_cache().lock().unwrap(); + if let Some(names) = cache.get(path) { + return Ok(names.clone()); + } + } + let images = build_epub_image_index(path)?; + { + let mut cache = epub_index_cache().lock().unwrap(); + cache.insert(path.to_path_buf(), images.clone()); + } + Ok(images) +} + +fn parse_epub_page_count(path: &Path) -> Result { + let images = build_epub_image_index(path)?; + Ok(images.len() as i32) +} + +fn analyze_epub(path: &Path) -> Result<(i32, Vec)> { + let images = get_epub_image_index(path)?; + let count = images.len() as i32; + + let file = std::fs::File::open(path) + .with_context(|| format!("cannot open epub: {}", path.display()))?; + let mut archive = zip::ZipArchive::new(file)?; + + for img_path in &images { + if let Ok(mut entry) = archive.by_name(img_path) { + let mut buf = Vec::new(); + if entry.read_to_end(&mut buf).is_ok() && !buf.is_empty() { + return Ok((count, buf)); + } + } + } + + Err(anyhow::anyhow!( + "no readable images in epub: {}", + path.display() + )) +} + +fn extract_epub_page(path: &Path, page_number: u32) -> Result> { + let images = get_epub_image_index(path)?; + let index = page_number as usize - 1; + let img_path = images + .get(index) + .with_context(|| { + format!( + "page {} out of range (total: {})", + page_number, + images.len() + ) + })?; + + let file = std::fs::File::open(path) + .with_context(|| format!("cannot open epub: {}", path.display()))?; + let mut archive = zip::ZipArchive::new(file)?; + let mut entry = archive + .by_name(img_path) + .with_context(|| format!("image '{}' not found in epub", img_path))?; + let mut buf = Vec::new(); + entry.read_to_end(&mut buf)?; + Ok(buf) +} + +// --- EPUB path/encoding helpers --- + +fn resolve_epub_path(base_dir: &str, href: &str) -> String { + if let Some(stripped) = href.strip_prefix('/') { + return normalize_epub_path(stripped); + } + if base_dir.is_empty() { + return normalize_epub_path(href); + } + normalize_epub_path(&format!("{}/{}", base_dir, href)) +} + +fn normalize_epub_path(path: &str) -> String { + let mut parts: Vec<&str> = Vec::new(); + for part in path.split('/') { + match part { + ".." => { + parts.pop(); + } + "." | "" => {} + _ => parts.push(part), + } + } + parts.join("/") +} + +fn percent_decode_epub(s: &str) -> String { + if !s.contains('%') { + return s.to_string(); + } + let bytes = s.as_bytes(); + let mut result = Vec::with_capacity(bytes.len()); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'%' && i + 2 < bytes.len() { + if let (Some(h), Some(l)) = (epub_hex_val(bytes[i + 1]), epub_hex_val(bytes[i + 2])) { + result.push(h * 16 + l); + i += 3; + continue; + } + } + result.push(bytes[i]); + i += 1; + } + String::from_utf8_lossy(&result).to_string() +} + +fn epub_hex_val(b: u8) -> Option { + match b { + b'0'..=b'9' => Some(b - b'0'), + b'a'..=b'f' => Some(b - b'a' + 10), + b'A'..=b'F' => Some(b - b'A' + 10), + _ => None, + } +} + +fn decode_xml_entities(s: &str) -> String { + if !s.contains('&') { + return s.to_string(); + } + s.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", "\"") + .replace("'", "'") +} + /// Convert a CBR file to CBZ in-place (same directory, same stem). /// /// The conversion is safe: a `.cbz.tmp` file is written first, verified, then diff --git a/infra/migrations/0046_add_epub_format.sql b/infra/migrations/0046_add_epub_format.sql new file mode 100644 index 0000000..4ad7f18 --- /dev/null +++ b/infra/migrations/0046_add_epub_format.sql @@ -0,0 +1,10 @@ +-- Add EPUB to allowed format values in book_files and books tables. +-- PostgreSQL CHECK constraints are dropped+recreated (no ALTER CONSTRAINT). + +-- book_files.format +ALTER TABLE book_files DROP CONSTRAINT IF EXISTS book_files_format_check; +ALTER TABLE book_files ADD CONSTRAINT book_files_format_check CHECK (format IN ('pdf', 'cbz', 'cbr', 'epub')); + +-- books.format (denormalized column added in 0020) +ALTER TABLE books DROP CONSTRAINT IF EXISTS books_format_check; +ALTER TABLE books ADD CONSTRAINT books_format_check CHECK (format IN ('pdf', 'cbz', 'cbr', 'epub'));