feat: add EPUB format support with spine-aware image extraction

Parse EPUB structure (container.xml → OPF → spine → XHTML) to extract images in reading order. Zero new dependencies — reuses zip + regex crates with pre-compiled regexes and per-file index cache for performance. Falls back to CBZ-style image listing when spine contains no images. Includes DB migration, API/indexer/backoffice updates. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 07:05:47 +01:00
parent 3daa49ae6c
commit 736b8aedc0
8 changed files with 359 additions and 3 deletions
--- a/apps/api/src/books.rs
+++ b/apps/api/src/books.rs
@@ -102,7 +102,7 @@ pub struct BookDetails {
    tag = "books",
    params(
        ("library_id" = Option<String>, Query, description = "Filter by library ID"),
-        ("kind" = Option<String>, Query, description = "Filter by book kind (cbz, cbr, pdf)"),
+        ("kind" = Option<String>, Query, description = "Filter by book kind (cbz, cbr, pdf, epub)"),
        ("series" = Option<String>, Query, description = "Filter by series name (use 'unclassified' for books without series)"),
        ("reading_status" = Option<String>, Query, description = "Filter by reading status, comma-separated (e.g. 'unread,reading')"),
        ("page" = Option<i64>, Query, description = "Page number (1-indexed, default 1)"),
--- a/apps/api/src/pages.rs
+++ b/apps/api/src/pages.rs
@@ -351,6 +351,7 @@ async fn prefetch_page(state: AppState, params: &PrefetchParams<'_>) {
        Some(ref e) if e == "cbz" => "cbz",
        Some(ref e) if e == "cbr" => "cbr",
        Some(ref e) if e == "pdf" => "pdf",
+        Some(ref e) if e == "epub" => "epub",
        _ => return,
    }
    .to_string();
@@ -479,6 +480,7 @@ fn render_page(
        "cbz" => parsers::BookFormat::Cbz,
        "cbr" => parsers::BookFormat::Cbr,
        "pdf" => parsers::BookFormat::Pdf,
+        "epub" => parsers::BookFormat::Epub,
        _ => return Err(ApiError::bad_request("unsupported source format")),
    };

--- a/apps/api/src/search.rs
+++ b/apps/api/src/search.rs
@@ -47,7 +47,7 @@ pub struct SearchResponse {
    params(
        ("q" = String, Query, description = "Search query (books + series via PostgreSQL full-text)"),
        ("library_id" = Option<String>, Query, description = "Filter by library ID"),
-        ("type" = Option<String>, Query, description = "Filter by type (cbz, cbr, pdf)"),
+        ("type" = Option<String>, Query, description = "Filter by type (cbz, cbr, pdf, epub)"),
        ("kind" = Option<String>, Query, description = "Filter by kind (alias for type)"),
        ("limit" = Option<usize>, Query, description = "Max results per type (max 100)"),
    ),
--- a/apps/backoffice/app/components/BookCard.tsx
+++ b/apps/backoffice/app/components/BookCard.tsx
@@ -115,6 +115,7 @@ export function BookCard({ book, readingStatus }: BookCardProps) {
              ${(book.format ?? book.kind) === 'cbz' ? 'bg-success/10 text-success' : ''}
              ${(book.format ?? book.kind) === 'cbr' ? 'bg-warning/10 text-warning' : ''}
              ${(book.format ?? book.kind) === 'pdf' ? 'bg-destructive/10 text-destructive' : ''}
+              ${(book.format ?? book.kind) === 'epub' ? 'bg-info/10 text-info' : ''}
            `}>
              {book.format ?? book.kind}
            </span>
--- a/apps/indexer/src/analyzer.rs
+++ b/apps/indexer/src/analyzer.rs
@@ -290,6 +290,7 @@ fn book_format_from_str(s: &str) -> Option<BookFormat> {
        "cbz" => Some(BookFormat::Cbz),
        "cbr" => Some(BookFormat::Cbr),
        "pdf" => Some(BookFormat::Pdf),
+        "epub" => Some(BookFormat::Epub),
        _ => None,
    }
 }
--- a/apps/indexer/src/utils.rs
+++ b/apps/indexer/src/utils.rs
@@ -40,7 +40,7 @@ pub fn compute_fingerprint(path: &Path, size: u64, mtime: &DateTime<Utc>) -> Res

 pub fn kind_from_format(format: BookFormat) -> &'static str {
    match format {
-        BookFormat::Pdf => "ebook",
+        BookFormat::Pdf | BookFormat::Epub => "ebook",
        BookFormat::Cbz | BookFormat::Cbr => "comic",
    }
 }
--- a/crates/parsers/src/lib.rs
+++ b/crates/parsers/src/lib.rs
@@ -9,6 +9,7 @@ pub enum BookFormat {
    Cbz,
    Cbr,
    Pdf,
+    Epub,
 }

 impl BookFormat {
@@ -17,6 +18,7 @@ impl BookFormat {
            Self::Cbz => "cbz",
            Self::Cbr => "cbr",
            Self::Pdf => "pdf",
+            Self::Epub => "epub",
        }
    }
 }
@@ -35,6 +37,7 @@ pub fn detect_format(path: &Path) -> Option<BookFormat> {
        "cbz" => Some(BookFormat::Cbz),
        "cbr" => Some(BookFormat::Cbr),
        "pdf" => Some(BookFormat::Pdf),
+        "epub" => Some(BookFormat::Epub),
        _ => None,
    }
 }
@@ -144,6 +147,7 @@ pub fn parse_metadata(
        BookFormat::Cbz => parse_cbz_page_count(path).ok(),
        BookFormat::Cbr => parse_cbr_page_count(path).ok(),
        BookFormat::Pdf => parse_pdf_page_count(path).ok(),
+        BookFormat::Epub => parse_epub_page_count(path).ok(),
    };

    Ok(meta)
@@ -156,6 +160,7 @@ pub fn analyze_book(path: &Path, format: BookFormat, pdf_render_scale: u32) -> R
        BookFormat::Cbz => analyze_cbz(path, true),
        BookFormat::Cbr => analyze_cbr(path, true),
        BookFormat::Pdf => analyze_pdf(path, pdf_render_scale),
+        BookFormat::Epub => analyze_epub(path),
    }
 }

@@ -530,6 +535,7 @@ pub fn list_archive_images(path: &Path, format: BookFormat) -> Result<Vec<String
        BookFormat::Cbz => list_cbz_images(path),
        BookFormat::Cbr => list_cbr_images(path),
        BookFormat::Pdf => Err(anyhow::anyhow!("list_archive_images not applicable for PDF")),
+        BookFormat::Epub => get_epub_image_index(path),
    }
 }

@@ -629,6 +635,7 @@ pub fn extract_image_by_name(path: &Path, format: BookFormat, image_name: &str)
        BookFormat::Cbz => extract_cbz_by_name(path, image_name),
        BookFormat::Cbr => extract_cbr_by_name(path, image_name),
        BookFormat::Pdf => Err(anyhow::anyhow!("use extract_page for PDF")),
+        BookFormat::Epub => extract_cbz_by_name(path, image_name),
    }
 }

@@ -721,6 +728,7 @@ pub fn extract_page(path: &Path, format: BookFormat, page_number: u32, pdf_rende
            let width = if pdf_render_width == 0 { 1200 } else { pdf_render_width };
            render_pdf_page_n(path, page_number, width)
        }
+        BookFormat::Epub => extract_epub_page(path, page_number),
    }
 }

@@ -894,6 +902,340 @@ fn render_pdf_page_n(path: &Path, page_number: u32, width: u32) -> Result<Vec<u8
 }


+// ============================================================
+// EPUB support — spine-aware image index with cache
+// ============================================================
+
+/// Cache of ordered image paths per EPUB file. Avoids re-parsing OPF/XHTML on every page request.
+static EPUB_INDEX_CACHE: OnceLock<Mutex<HashMap<PathBuf, Vec<String>>>> = OnceLock::new();
+
+fn epub_index_cache() -> &'static Mutex<HashMap<PathBuf, Vec<String>>> {
+    EPUB_INDEX_CACHE.get_or_init(|| Mutex::new(HashMap::new()))
+}
+
+// Pre-compiled regex patterns for EPUB XML parsing (compiled once on first use)
+static RE_EPUB_ROOTFILE: OnceLock<regex::Regex> = OnceLock::new();
+static RE_EPUB_ITEM: OnceLock<regex::Regex> = OnceLock::new();
+static RE_EPUB_ITEMREF: OnceLock<regex::Regex> = OnceLock::new();
+static RE_EPUB_IMG_SRC: OnceLock<regex::Regex> = OnceLock::new();
+static RE_EPUB_SVG_HREF: OnceLock<regex::Regex> = OnceLock::new();
+static RE_EPUB_ATTR_ID: OnceLock<regex::Regex> = OnceLock::new();
+static RE_EPUB_ATTR_HREF: OnceLock<regex::Regex> = OnceLock::new();
+static RE_EPUB_ATTR_MEDIA: OnceLock<regex::Regex> = OnceLock::new();
+
+struct EpubManifestItem {
+    href: String,
+    media_type: String,
+}
+
+/// Build the ordered list of image paths for an EPUB file.
+/// Walks the OPF spine to determine reading order, parses XHTML/SVG pages
+/// for image references, and falls back to CBZ-style listing if no
+/// images are found through the spine.
+fn build_epub_image_index(path: &Path) -> Result<Vec<String>> {
+    let file = std::fs::File::open(path)
+        .with_context(|| format!("cannot open epub: {}", path.display()))?;
+    let mut archive = zip::ZipArchive::new(file)
+        .with_context(|| format!("invalid epub zip: {}", path.display()))?;
+
+    // 1. Find OPF path from META-INF/container.xml
+    let opf_path = {
+        let mut entry = archive
+            .by_name("META-INF/container.xml")
+            .context("missing META-INF/container.xml — not a valid EPUB")?;
+        let mut buf = Vec::new();
+        entry.read_to_end(&mut buf)?;
+        let xml = String::from_utf8_lossy(&buf);
+        let re = RE_EPUB_ROOTFILE.get_or_init(|| {
+            regex::Regex::new(r#"<(?:\w+:)?rootfile[^>]+full-path="([^"]+)""#).unwrap()
+        });
+        re.captures(&xml)
+            .and_then(|c| c.get(1))
+            .map(|m| decode_xml_entities(m.as_str()))
+            .context("no rootfile found in container.xml")?
+    };
+
+    let opf_dir = std::path::Path::new(&opf_path)
+        .parent()
+        .map(|p| p.to_string_lossy().to_string())
+        .unwrap_or_default();
+
+    // 2. Parse OPF manifest + spine
+    let (manifest, spine_idrefs) = {
+        let mut entry = archive
+            .by_name(&opf_path)
+            .with_context(|| format!("missing OPF file: {}", opf_path))?;
+        let mut buf = Vec::new();
+        entry.read_to_end(&mut buf)?;
+        let xml = String::from_utf8_lossy(&buf);
+        parse_epub_opf(&xml, &opf_dir)?
+    };
+
+    // 3. Walk spine entries to build ordered image list
+    let re_img = RE_EPUB_IMG_SRC.get_or_init(|| {
+        regex::Regex::new(r#"(?i)<img\s[^>]*src=["']([^"']+)["']"#).unwrap()
+    });
+    let re_svg = RE_EPUB_SVG_HREF.get_or_init(|| {
+        regex::Regex::new(r#"(?i)<image\s[^>]*(?:xlink:)?href=["']([^"']+)["']"#).unwrap()
+    });
+
+    let mut images: Vec<String> = Vec::new();
+    let mut seen = std::collections::HashSet::new();
+
+    for idref in &spine_idrefs {
+        let item = match manifest.get(idref.as_str()) {
+            Some(item) => item,
+            None => continue,
+        };
+
+        // Direct raster image in spine (rare but possible)
+        if item.media_type.starts_with("image/") && !item.media_type.contains("svg") {
+            if seen.insert(item.href.clone()) {
+                images.push(item.href.clone());
+            }
+            continue;
+        }
+
+        // Read XHTML/SVG content — entry is dropped at end of match arm, releasing archive borrow
+        let content = match archive.by_name(&item.href) {
+            Ok(mut entry) => {
+                let mut buf = Vec::new();
+                match entry.read_to_end(&mut buf) {
+                    Ok(_) => String::from_utf8_lossy(&buf).to_string(),
+                    Err(_) => continue,
+                }
+            }
+            Err(_) => continue,
+        };
+
+        let content_dir = std::path::Path::new(&item.href)
+            .parent()
+            .map(|p| p.to_string_lossy().to_string())
+            .unwrap_or_default();
+
+        // Extract <img src="..."> and <image [xlink:]href="...">
+        for re in [re_img, re_svg] {
+            for cap in re.captures_iter(&content) {
+                if let Some(src) = cap.get(1) {
+                    let src_str = src.as_str();
+                    if src_str.starts_with("data:") {
+                        continue;
+                    }
+                    let decoded = decode_xml_entities(&percent_decode_epub(src_str));
+                    let resolved = resolve_epub_path(&content_dir, &decoded);
+                    if seen.insert(resolved.clone()) {
+                        images.push(resolved);
+                    }
+                }
+            }
+        }
+    }
+
+    // 4. Fallback: no images from spine → list all images in ZIP (CBZ-style)
+    if images.is_empty() {
+        for i in 0..archive.len() {
+            if let Ok(entry) = archive.by_index(i) {
+                let name = entry.name().to_string();
+                if is_image_name(&name.to_ascii_lowercase()) && seen.insert(name.clone()) {
+                    images.push(name);
+                }
+            }
+        }
+        images.sort_by(|a, b| natord::compare(a, b));
+    }
+
+    if images.is_empty() {
+        return Err(anyhow::anyhow!("no images found in epub: {}", path.display()));
+    }
+
+    Ok(images)
+}
+
+fn parse_epub_opf(
+    xml: &str,
+    opf_dir: &str,
+) -> Result<(HashMap<String, EpubManifestItem>, Vec<String>)> {
+    let re_item = RE_EPUB_ITEM.get_or_init(|| {
+        regex::Regex::new(r#"(?s)<(?:\w+:)?item\s([^>]+?)/?>"#).unwrap()
+    });
+    let re_itemref = RE_EPUB_ITEMREF.get_or_init(|| {
+        regex::Regex::new(r#"<(?:\w+:)?itemref\s[^>]*idref="([^"]+)""#).unwrap()
+    });
+    let re_id = RE_EPUB_ATTR_ID.get_or_init(|| {
+        regex::Regex::new(r#"(?:^|\s)id="([^"]+)""#).unwrap()
+    });
+    let re_href = RE_EPUB_ATTR_HREF.get_or_init(|| {
+        regex::Regex::new(r#"(?:^|\s)href="([^"]+)""#).unwrap()
+    });
+    let re_media = RE_EPUB_ATTR_MEDIA.get_or_init(|| {
+        regex::Regex::new(r#"media-type="([^"]+)""#).unwrap()
+    });
+
+    let mut manifest: HashMap<String, EpubManifestItem> = HashMap::new();
+    for cap in re_item.captures_iter(xml) {
+        if let Some(attrs) = cap.get(1) {
+            let a = attrs.as_str();
+            let id = re_id.captures(a).and_then(|c| c.get(1));
+            let href = re_href.captures(a).and_then(|c| c.get(1));
+            let media = re_media.captures(a).and_then(|c| c.get(1));
+
+            if let (Some(id), Some(href), Some(media)) = (id, href, media) {
+                let decoded_href = decode_xml_entities(&percent_decode_epub(href.as_str()));
+                let resolved = resolve_epub_path(opf_dir, &decoded_href);
+                manifest.insert(
+                    id.as_str().to_string(),
+                    EpubManifestItem {
+                        href: resolved,
+                        media_type: media.as_str().to_string(),
+                    },
+                );
+            }
+        }
+    }
+
+    let spine_idrefs: Vec<String> = re_itemref
+        .captures_iter(xml)
+        .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
+        .collect();
+
+    Ok((manifest, spine_idrefs))
+}
+
+/// Get the cached image index for an EPUB, building it on first access.
+fn get_epub_image_index(path: &Path) -> Result<Vec<String>> {
+    {
+        let cache = epub_index_cache().lock().unwrap();
+        if let Some(names) = cache.get(path) {
+            return Ok(names.clone());
+        }
+    }
+    let images = build_epub_image_index(path)?;
+    {
+        let mut cache = epub_index_cache().lock().unwrap();
+        cache.insert(path.to_path_buf(), images.clone());
+    }
+    Ok(images)
+}
+
+fn parse_epub_page_count(path: &Path) -> Result<i32> {
+    let images = build_epub_image_index(path)?;
+    Ok(images.len() as i32)
+}
+
+fn analyze_epub(path: &Path) -> Result<(i32, Vec<u8>)> {
+    let images = get_epub_image_index(path)?;
+    let count = images.len() as i32;
+
+    let file = std::fs::File::open(path)
+        .with_context(|| format!("cannot open epub: {}", path.display()))?;
+    let mut archive = zip::ZipArchive::new(file)?;
+
+    for img_path in &images {
+        if let Ok(mut entry) = archive.by_name(img_path) {
+            let mut buf = Vec::new();
+            if entry.read_to_end(&mut buf).is_ok() && !buf.is_empty() {
+                return Ok((count, buf));
+            }
+        }
+    }
+
+    Err(anyhow::anyhow!(
+        "no readable images in epub: {}",
+        path.display()
+    ))
+}
+
+fn extract_epub_page(path: &Path, page_number: u32) -> Result<Vec<u8>> {
+    let images = get_epub_image_index(path)?;
+    let index = page_number as usize - 1;
+    let img_path = images
+        .get(index)
+        .with_context(|| {
+            format!(
+                "page {} out of range (total: {})",
+                page_number,
+                images.len()
+            )
+        })?;
+
+    let file = std::fs::File::open(path)
+        .with_context(|| format!("cannot open epub: {}", path.display()))?;
+    let mut archive = zip::ZipArchive::new(file)?;
+    let mut entry = archive
+        .by_name(img_path)
+        .with_context(|| format!("image '{}' not found in epub", img_path))?;
+    let mut buf = Vec::new();
+    entry.read_to_end(&mut buf)?;
+    Ok(buf)
+}
+
+// --- EPUB path/encoding helpers ---
+
+fn resolve_epub_path(base_dir: &str, href: &str) -> String {
+    if let Some(stripped) = href.strip_prefix('/') {
+        return normalize_epub_path(stripped);
+    }
+    if base_dir.is_empty() {
+        return normalize_epub_path(href);
+    }
+    normalize_epub_path(&format!("{}/{}", base_dir, href))
+}
+
+fn normalize_epub_path(path: &str) -> String {
+    let mut parts: Vec<&str> = Vec::new();
+    for part in path.split('/') {
+        match part {
+            ".." => {
+                parts.pop();
+            }
+            "." | "" => {}
+            _ => parts.push(part),
+        }
+    }
+    parts.join("/")
+}
+
+fn percent_decode_epub(s: &str) -> String {
+    if !s.contains('%') {
+        return s.to_string();
+    }
+    let bytes = s.as_bytes();
+    let mut result = Vec::with_capacity(bytes.len());
+    let mut i = 0;
+    while i < bytes.len() {
+        if bytes[i] == b'%' && i + 2 < bytes.len() {
+            if let (Some(h), Some(l)) = (epub_hex_val(bytes[i + 1]), epub_hex_val(bytes[i + 2])) {
+                result.push(h * 16 + l);
+                i += 3;
+                continue;
+            }
+        }
+        result.push(bytes[i]);
+        i += 1;
+    }
+    String::from_utf8_lossy(&result).to_string()
+}
+
+fn epub_hex_val(b: u8) -> Option<u8> {
+    match b {
+        b'0'..=b'9' => Some(b - b'0'),
+        b'a'..=b'f' => Some(b - b'a' + 10),
+        b'A'..=b'F' => Some(b - b'A' + 10),
+        _ => None,
+    }
+}
+
+fn decode_xml_entities(s: &str) -> String {
+    if !s.contains('&') {
+        return s.to_string();
+    }
+    s.replace("&amp;", "&")
+        .replace("&lt;", "<")
+        .replace("&gt;", ">")
+        .replace("&quot;", "\"")
+        .replace("&apos;", "'")
+}
+
 /// Convert a CBR file to CBZ in-place (same directory, same stem).
 ///
 /// The conversion is safe: a `.cbz.tmp` file is written first, verified, then
--- a/infra/migrations/0046_add_epub_format.sql
+++ b/infra/migrations/0046_add_epub_format.sql
@@ -0,0 +1,10 @@
+-- Add EPUB to allowed format values in book_files and books tables.
+-- PostgreSQL CHECK constraints are dropped+recreated (no ALTER CONSTRAINT).
+
+-- book_files.format
+ALTER TABLE book_files DROP CONSTRAINT IF EXISTS book_files_format_check;
+ALTER TABLE book_files ADD CONSTRAINT book_files_format_check CHECK (format IN ('pdf', 'cbz', 'cbr', 'epub'));
+
+-- books.format (denormalized column added in 0020)
+ALTER TABLE books DROP CONSTRAINT IF EXISTS books_format_check;
+ALTER TABLE books ADD CONSTRAINT books_format_check CHECK (format IN ('pdf', 'cbz', 'cbr', 'epub'));