feat: add EPUB format support with spine-aware image extraction
Parse EPUB structure (container.xml → OPF → spine → XHTML) to extract images in reading order. Zero new dependencies — reuses zip + regex crates with pre-compiled regexes and per-file index cache for performance. Falls back to CBZ-style image listing when spine contains no images. Includes DB migration, API/indexer/backoffice updates. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -102,7 +102,7 @@ pub struct BookDetails {
|
||||
tag = "books",
|
||||
params(
|
||||
("library_id" = Option<String>, Query, description = "Filter by library ID"),
|
||||
("kind" = Option<String>, Query, description = "Filter by book kind (cbz, cbr, pdf)"),
|
||||
("kind" = Option<String>, Query, description = "Filter by book kind (cbz, cbr, pdf, epub)"),
|
||||
("series" = Option<String>, Query, description = "Filter by series name (use 'unclassified' for books without series)"),
|
||||
("reading_status" = Option<String>, Query, description = "Filter by reading status, comma-separated (e.g. 'unread,reading')"),
|
||||
("page" = Option<i64>, Query, description = "Page number (1-indexed, default 1)"),
|
||||
|
||||
@@ -351,6 +351,7 @@ async fn prefetch_page(state: AppState, params: &PrefetchParams<'_>) {
|
||||
Some(ref e) if e == "cbz" => "cbz",
|
||||
Some(ref e) if e == "cbr" => "cbr",
|
||||
Some(ref e) if e == "pdf" => "pdf",
|
||||
Some(ref e) if e == "epub" => "epub",
|
||||
_ => return,
|
||||
}
|
||||
.to_string();
|
||||
@@ -479,6 +480,7 @@ fn render_page(
|
||||
"cbz" => parsers::BookFormat::Cbz,
|
||||
"cbr" => parsers::BookFormat::Cbr,
|
||||
"pdf" => parsers::BookFormat::Pdf,
|
||||
"epub" => parsers::BookFormat::Epub,
|
||||
_ => return Err(ApiError::bad_request("unsupported source format")),
|
||||
};
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ pub struct SearchResponse {
|
||||
params(
|
||||
("q" = String, Query, description = "Search query (books + series via PostgreSQL full-text)"),
|
||||
("library_id" = Option<String>, Query, description = "Filter by library ID"),
|
||||
("type" = Option<String>, Query, description = "Filter by type (cbz, cbr, pdf)"),
|
||||
("type" = Option<String>, Query, description = "Filter by type (cbz, cbr, pdf, epub)"),
|
||||
("kind" = Option<String>, Query, description = "Filter by kind (alias for type)"),
|
||||
("limit" = Option<usize>, Query, description = "Max results per type (max 100)"),
|
||||
),
|
||||
|
||||
@@ -115,6 +115,7 @@ export function BookCard({ book, readingStatus }: BookCardProps) {
|
||||
${(book.format ?? book.kind) === 'cbz' ? 'bg-success/10 text-success' : ''}
|
||||
${(book.format ?? book.kind) === 'cbr' ? 'bg-warning/10 text-warning' : ''}
|
||||
${(book.format ?? book.kind) === 'pdf' ? 'bg-destructive/10 text-destructive' : ''}
|
||||
${(book.format ?? book.kind) === 'epub' ? 'bg-info/10 text-info' : ''}
|
||||
`}>
|
||||
{book.format ?? book.kind}
|
||||
</span>
|
||||
|
||||
@@ -290,6 +290,7 @@ fn book_format_from_str(s: &str) -> Option<BookFormat> {
|
||||
"cbz" => Some(BookFormat::Cbz),
|
||||
"cbr" => Some(BookFormat::Cbr),
|
||||
"pdf" => Some(BookFormat::Pdf),
|
||||
"epub" => Some(BookFormat::Epub),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,7 +40,7 @@ pub fn compute_fingerprint(path: &Path, size: u64, mtime: &DateTime<Utc>) -> Res
|
||||
|
||||
pub fn kind_from_format(format: BookFormat) -> &'static str {
|
||||
match format {
|
||||
BookFormat::Pdf => "ebook",
|
||||
BookFormat::Pdf | BookFormat::Epub => "ebook",
|
||||
BookFormat::Cbz | BookFormat::Cbr => "comic",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ pub enum BookFormat {
|
||||
Cbz,
|
||||
Cbr,
|
||||
Pdf,
|
||||
Epub,
|
||||
}
|
||||
|
||||
impl BookFormat {
|
||||
@@ -17,6 +18,7 @@ impl BookFormat {
|
||||
Self::Cbz => "cbz",
|
||||
Self::Cbr => "cbr",
|
||||
Self::Pdf => "pdf",
|
||||
Self::Epub => "epub",
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -35,6 +37,7 @@ pub fn detect_format(path: &Path) -> Option<BookFormat> {
|
||||
"cbz" => Some(BookFormat::Cbz),
|
||||
"cbr" => Some(BookFormat::Cbr),
|
||||
"pdf" => Some(BookFormat::Pdf),
|
||||
"epub" => Some(BookFormat::Epub),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@@ -144,6 +147,7 @@ pub fn parse_metadata(
|
||||
BookFormat::Cbz => parse_cbz_page_count(path).ok(),
|
||||
BookFormat::Cbr => parse_cbr_page_count(path).ok(),
|
||||
BookFormat::Pdf => parse_pdf_page_count(path).ok(),
|
||||
BookFormat::Epub => parse_epub_page_count(path).ok(),
|
||||
};
|
||||
|
||||
Ok(meta)
|
||||
@@ -156,6 +160,7 @@ pub fn analyze_book(path: &Path, format: BookFormat, pdf_render_scale: u32) -> R
|
||||
BookFormat::Cbz => analyze_cbz(path, true),
|
||||
BookFormat::Cbr => analyze_cbr(path, true),
|
||||
BookFormat::Pdf => analyze_pdf(path, pdf_render_scale),
|
||||
BookFormat::Epub => analyze_epub(path),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -530,6 +535,7 @@ pub fn list_archive_images(path: &Path, format: BookFormat) -> Result<Vec<String
|
||||
BookFormat::Cbz => list_cbz_images(path),
|
||||
BookFormat::Cbr => list_cbr_images(path),
|
||||
BookFormat::Pdf => Err(anyhow::anyhow!("list_archive_images not applicable for PDF")),
|
||||
BookFormat::Epub => get_epub_image_index(path),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -629,6 +635,7 @@ pub fn extract_image_by_name(path: &Path, format: BookFormat, image_name: &str)
|
||||
BookFormat::Cbz => extract_cbz_by_name(path, image_name),
|
||||
BookFormat::Cbr => extract_cbr_by_name(path, image_name),
|
||||
BookFormat::Pdf => Err(anyhow::anyhow!("use extract_page for PDF")),
|
||||
BookFormat::Epub => extract_cbz_by_name(path, image_name),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -721,6 +728,7 @@ pub fn extract_page(path: &Path, format: BookFormat, page_number: u32, pdf_rende
|
||||
let width = if pdf_render_width == 0 { 1200 } else { pdf_render_width };
|
||||
render_pdf_page_n(path, page_number, width)
|
||||
}
|
||||
BookFormat::Epub => extract_epub_page(path, page_number),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -894,6 +902,340 @@ fn render_pdf_page_n(path: &Path, page_number: u32, width: u32) -> Result<Vec<u8
|
||||
}
|
||||
|
||||
|
||||
// ============================================================
|
||||
// EPUB support — spine-aware image index with cache
|
||||
// ============================================================
|
||||
|
||||
/// Cache of ordered image paths per EPUB file. Avoids re-parsing OPF/XHTML on every page request.
|
||||
static EPUB_INDEX_CACHE: OnceLock<Mutex<HashMap<PathBuf, Vec<String>>>> = OnceLock::new();
|
||||
|
||||
fn epub_index_cache() -> &'static Mutex<HashMap<PathBuf, Vec<String>>> {
|
||||
EPUB_INDEX_CACHE.get_or_init(|| Mutex::new(HashMap::new()))
|
||||
}
|
||||
|
||||
// Pre-compiled regex patterns for EPUB XML parsing (compiled once on first use)
|
||||
static RE_EPUB_ROOTFILE: OnceLock<regex::Regex> = OnceLock::new();
|
||||
static RE_EPUB_ITEM: OnceLock<regex::Regex> = OnceLock::new();
|
||||
static RE_EPUB_ITEMREF: OnceLock<regex::Regex> = OnceLock::new();
|
||||
static RE_EPUB_IMG_SRC: OnceLock<regex::Regex> = OnceLock::new();
|
||||
static RE_EPUB_SVG_HREF: OnceLock<regex::Regex> = OnceLock::new();
|
||||
static RE_EPUB_ATTR_ID: OnceLock<regex::Regex> = OnceLock::new();
|
||||
static RE_EPUB_ATTR_HREF: OnceLock<regex::Regex> = OnceLock::new();
|
||||
static RE_EPUB_ATTR_MEDIA: OnceLock<regex::Regex> = OnceLock::new();
|
||||
|
||||
struct EpubManifestItem {
|
||||
href: String,
|
||||
media_type: String,
|
||||
}
|
||||
|
||||
/// Build the ordered list of image paths for an EPUB file.
|
||||
/// Walks the OPF spine to determine reading order, parses XHTML/SVG pages
|
||||
/// for image references, and falls back to CBZ-style listing if no
|
||||
/// images are found through the spine.
|
||||
fn build_epub_image_index(path: &Path) -> Result<Vec<String>> {
|
||||
let file = std::fs::File::open(path)
|
||||
.with_context(|| format!("cannot open epub: {}", path.display()))?;
|
||||
let mut archive = zip::ZipArchive::new(file)
|
||||
.with_context(|| format!("invalid epub zip: {}", path.display()))?;
|
||||
|
||||
// 1. Find OPF path from META-INF/container.xml
|
||||
let opf_path = {
|
||||
let mut entry = archive
|
||||
.by_name("META-INF/container.xml")
|
||||
.context("missing META-INF/container.xml — not a valid EPUB")?;
|
||||
let mut buf = Vec::new();
|
||||
entry.read_to_end(&mut buf)?;
|
||||
let xml = String::from_utf8_lossy(&buf);
|
||||
let re = RE_EPUB_ROOTFILE.get_or_init(|| {
|
||||
regex::Regex::new(r#"<(?:\w+:)?rootfile[^>]+full-path="([^"]+)""#).unwrap()
|
||||
});
|
||||
re.captures(&xml)
|
||||
.and_then(|c| c.get(1))
|
||||
.map(|m| decode_xml_entities(m.as_str()))
|
||||
.context("no rootfile found in container.xml")?
|
||||
};
|
||||
|
||||
let opf_dir = std::path::Path::new(&opf_path)
|
||||
.parent()
|
||||
.map(|p| p.to_string_lossy().to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
// 2. Parse OPF manifest + spine
|
||||
let (manifest, spine_idrefs) = {
|
||||
let mut entry = archive
|
||||
.by_name(&opf_path)
|
||||
.with_context(|| format!("missing OPF file: {}", opf_path))?;
|
||||
let mut buf = Vec::new();
|
||||
entry.read_to_end(&mut buf)?;
|
||||
let xml = String::from_utf8_lossy(&buf);
|
||||
parse_epub_opf(&xml, &opf_dir)?
|
||||
};
|
||||
|
||||
// 3. Walk spine entries to build ordered image list
|
||||
let re_img = RE_EPUB_IMG_SRC.get_or_init(|| {
|
||||
regex::Regex::new(r#"(?i)<img\s[^>]*src=["']([^"']+)["']"#).unwrap()
|
||||
});
|
||||
let re_svg = RE_EPUB_SVG_HREF.get_or_init(|| {
|
||||
regex::Regex::new(r#"(?i)<image\s[^>]*(?:xlink:)?href=["']([^"']+)["']"#).unwrap()
|
||||
});
|
||||
|
||||
let mut images: Vec<String> = Vec::new();
|
||||
let mut seen = std::collections::HashSet::new();
|
||||
|
||||
for idref in &spine_idrefs {
|
||||
let item = match manifest.get(idref.as_str()) {
|
||||
Some(item) => item,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// Direct raster image in spine (rare but possible)
|
||||
if item.media_type.starts_with("image/") && !item.media_type.contains("svg") {
|
||||
if seen.insert(item.href.clone()) {
|
||||
images.push(item.href.clone());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Read XHTML/SVG content — entry is dropped at end of match arm, releasing archive borrow
|
||||
let content = match archive.by_name(&item.href) {
|
||||
Ok(mut entry) => {
|
||||
let mut buf = Vec::new();
|
||||
match entry.read_to_end(&mut buf) {
|
||||
Ok(_) => String::from_utf8_lossy(&buf).to_string(),
|
||||
Err(_) => continue,
|
||||
}
|
||||
}
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
let content_dir = std::path::Path::new(&item.href)
|
||||
.parent()
|
||||
.map(|p| p.to_string_lossy().to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
// Extract <img src="..."> and <image [xlink:]href="...">
|
||||
for re in [re_img, re_svg] {
|
||||
for cap in re.captures_iter(&content) {
|
||||
if let Some(src) = cap.get(1) {
|
||||
let src_str = src.as_str();
|
||||
if src_str.starts_with("data:") {
|
||||
continue;
|
||||
}
|
||||
let decoded = decode_xml_entities(&percent_decode_epub(src_str));
|
||||
let resolved = resolve_epub_path(&content_dir, &decoded);
|
||||
if seen.insert(resolved.clone()) {
|
||||
images.push(resolved);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Fallback: no images from spine → list all images in ZIP (CBZ-style)
|
||||
if images.is_empty() {
|
||||
for i in 0..archive.len() {
|
||||
if let Ok(entry) = archive.by_index(i) {
|
||||
let name = entry.name().to_string();
|
||||
if is_image_name(&name.to_ascii_lowercase()) && seen.insert(name.clone()) {
|
||||
images.push(name);
|
||||
}
|
||||
}
|
||||
}
|
||||
images.sort_by(|a, b| natord::compare(a, b));
|
||||
}
|
||||
|
||||
if images.is_empty() {
|
||||
return Err(anyhow::anyhow!("no images found in epub: {}", path.display()));
|
||||
}
|
||||
|
||||
Ok(images)
|
||||
}
|
||||
|
||||
fn parse_epub_opf(
|
||||
xml: &str,
|
||||
opf_dir: &str,
|
||||
) -> Result<(HashMap<String, EpubManifestItem>, Vec<String>)> {
|
||||
let re_item = RE_EPUB_ITEM.get_or_init(|| {
|
||||
regex::Regex::new(r#"(?s)<(?:\w+:)?item\s([^>]+?)/?>"#).unwrap()
|
||||
});
|
||||
let re_itemref = RE_EPUB_ITEMREF.get_or_init(|| {
|
||||
regex::Regex::new(r#"<(?:\w+:)?itemref\s[^>]*idref="([^"]+)""#).unwrap()
|
||||
});
|
||||
let re_id = RE_EPUB_ATTR_ID.get_or_init(|| {
|
||||
regex::Regex::new(r#"(?:^|\s)id="([^"]+)""#).unwrap()
|
||||
});
|
||||
let re_href = RE_EPUB_ATTR_HREF.get_or_init(|| {
|
||||
regex::Regex::new(r#"(?:^|\s)href="([^"]+)""#).unwrap()
|
||||
});
|
||||
let re_media = RE_EPUB_ATTR_MEDIA.get_or_init(|| {
|
||||
regex::Regex::new(r#"media-type="([^"]+)""#).unwrap()
|
||||
});
|
||||
|
||||
let mut manifest: HashMap<String, EpubManifestItem> = HashMap::new();
|
||||
for cap in re_item.captures_iter(xml) {
|
||||
if let Some(attrs) = cap.get(1) {
|
||||
let a = attrs.as_str();
|
||||
let id = re_id.captures(a).and_then(|c| c.get(1));
|
||||
let href = re_href.captures(a).and_then(|c| c.get(1));
|
||||
let media = re_media.captures(a).and_then(|c| c.get(1));
|
||||
|
||||
if let (Some(id), Some(href), Some(media)) = (id, href, media) {
|
||||
let decoded_href = decode_xml_entities(&percent_decode_epub(href.as_str()));
|
||||
let resolved = resolve_epub_path(opf_dir, &decoded_href);
|
||||
manifest.insert(
|
||||
id.as_str().to_string(),
|
||||
EpubManifestItem {
|
||||
href: resolved,
|
||||
media_type: media.as_str().to_string(),
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let spine_idrefs: Vec<String> = re_itemref
|
||||
.captures_iter(xml)
|
||||
.filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
|
||||
.collect();
|
||||
|
||||
Ok((manifest, spine_idrefs))
|
||||
}
|
||||
|
||||
/// Get the cached image index for an EPUB, building it on first access.
|
||||
fn get_epub_image_index(path: &Path) -> Result<Vec<String>> {
|
||||
{
|
||||
let cache = epub_index_cache().lock().unwrap();
|
||||
if let Some(names) = cache.get(path) {
|
||||
return Ok(names.clone());
|
||||
}
|
||||
}
|
||||
let images = build_epub_image_index(path)?;
|
||||
{
|
||||
let mut cache = epub_index_cache().lock().unwrap();
|
||||
cache.insert(path.to_path_buf(), images.clone());
|
||||
}
|
||||
Ok(images)
|
||||
}
|
||||
|
||||
fn parse_epub_page_count(path: &Path) -> Result<i32> {
|
||||
let images = build_epub_image_index(path)?;
|
||||
Ok(images.len() as i32)
|
||||
}
|
||||
|
||||
fn analyze_epub(path: &Path) -> Result<(i32, Vec<u8>)> {
|
||||
let images = get_epub_image_index(path)?;
|
||||
let count = images.len() as i32;
|
||||
|
||||
let file = std::fs::File::open(path)
|
||||
.with_context(|| format!("cannot open epub: {}", path.display()))?;
|
||||
let mut archive = zip::ZipArchive::new(file)?;
|
||||
|
||||
for img_path in &images {
|
||||
if let Ok(mut entry) = archive.by_name(img_path) {
|
||||
let mut buf = Vec::new();
|
||||
if entry.read_to_end(&mut buf).is_ok() && !buf.is_empty() {
|
||||
return Ok((count, buf));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!(
|
||||
"no readable images in epub: {}",
|
||||
path.display()
|
||||
))
|
||||
}
|
||||
|
||||
fn extract_epub_page(path: &Path, page_number: u32) -> Result<Vec<u8>> {
|
||||
let images = get_epub_image_index(path)?;
|
||||
let index = page_number as usize - 1;
|
||||
let img_path = images
|
||||
.get(index)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"page {} out of range (total: {})",
|
||||
page_number,
|
||||
images.len()
|
||||
)
|
||||
})?;
|
||||
|
||||
let file = std::fs::File::open(path)
|
||||
.with_context(|| format!("cannot open epub: {}", path.display()))?;
|
||||
let mut archive = zip::ZipArchive::new(file)?;
|
||||
let mut entry = archive
|
||||
.by_name(img_path)
|
||||
.with_context(|| format!("image '{}' not found in epub", img_path))?;
|
||||
let mut buf = Vec::new();
|
||||
entry.read_to_end(&mut buf)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
// --- EPUB path/encoding helpers ---
|
||||
|
||||
fn resolve_epub_path(base_dir: &str, href: &str) -> String {
|
||||
if let Some(stripped) = href.strip_prefix('/') {
|
||||
return normalize_epub_path(stripped);
|
||||
}
|
||||
if base_dir.is_empty() {
|
||||
return normalize_epub_path(href);
|
||||
}
|
||||
normalize_epub_path(&format!("{}/{}", base_dir, href))
|
||||
}
|
||||
|
||||
fn normalize_epub_path(path: &str) -> String {
|
||||
let mut parts: Vec<&str> = Vec::new();
|
||||
for part in path.split('/') {
|
||||
match part {
|
||||
".." => {
|
||||
parts.pop();
|
||||
}
|
||||
"." | "" => {}
|
||||
_ => parts.push(part),
|
||||
}
|
||||
}
|
||||
parts.join("/")
|
||||
}
|
||||
|
||||
fn percent_decode_epub(s: &str) -> String {
|
||||
if !s.contains('%') {
|
||||
return s.to_string();
|
||||
}
|
||||
let bytes = s.as_bytes();
|
||||
let mut result = Vec::with_capacity(bytes.len());
|
||||
let mut i = 0;
|
||||
while i < bytes.len() {
|
||||
if bytes[i] == b'%' && i + 2 < bytes.len() {
|
||||
if let (Some(h), Some(l)) = (epub_hex_val(bytes[i + 1]), epub_hex_val(bytes[i + 2])) {
|
||||
result.push(h * 16 + l);
|
||||
i += 3;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
result.push(bytes[i]);
|
||||
i += 1;
|
||||
}
|
||||
String::from_utf8_lossy(&result).to_string()
|
||||
}
|
||||
|
||||
fn epub_hex_val(b: u8) -> Option<u8> {
|
||||
match b {
|
||||
b'0'..=b'9' => Some(b - b'0'),
|
||||
b'a'..=b'f' => Some(b - b'a' + 10),
|
||||
b'A'..=b'F' => Some(b - b'A' + 10),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_xml_entities(s: &str) -> String {
|
||||
if !s.contains('&') {
|
||||
return s.to_string();
|
||||
}
|
||||
s.replace("&", "&")
|
||||
.replace("<", "<")
|
||||
.replace(">", ">")
|
||||
.replace(""", "\"")
|
||||
.replace("'", "'")
|
||||
}
|
||||
|
||||
/// Convert a CBR file to CBZ in-place (same directory, same stem).
|
||||
///
|
||||
/// The conversion is safe: a `.cbz.tmp` file is written first, verified, then
|
||||
|
||||
10
infra/migrations/0046_add_epub_format.sql
Normal file
10
infra/migrations/0046_add_epub_format.sql
Normal file
@@ -0,0 +1,10 @@
|
||||
-- Add EPUB to allowed format values in book_files and books tables.
|
||||
-- PostgreSQL CHECK constraints are dropped+recreated (no ALTER CONSTRAINT).
|
||||
|
||||
-- book_files.format
|
||||
ALTER TABLE book_files DROP CONSTRAINT IF EXISTS book_files_format_check;
|
||||
ALTER TABLE book_files ADD CONSTRAINT book_files_format_check CHECK (format IN ('pdf', 'cbz', 'cbr', 'epub'));
|
||||
|
||||
-- books.format (denormalized column added in 0020)
|
||||
ALTER TABLE books DROP CONSTRAINT IF EXISTS books_format_check;
|
||||
ALTER TABLE books ADD CONSTRAINT books_format_check CHECK (format IN ('pdf', 'cbz', 'cbr', 'epub'));
|
||||
Reference in New Issue
Block a user