feat: two-phase indexation with direct thumbnail generation in indexer

Phase 1 (discovery): walkdir + filename-only metadata, zero archive I/O. Books are visible immediately in the UI while Phase 2 runs in background.

Phase 2 (analysis): open each archive once via analyze_book() to extract page_count and first page bytes, then generate WebP thumbnail directly in the indexer — removing the HTTP roundtrip to the API checkup endpoint.

- Add parse_metadata_fast() (infallible, no archive I/O)
- Add analyze_book() returning (page_count, first_page_bytes) in one pass
- Add looks_like_image() magic bytes check for unrar p stdout validation
- Add lsar fallback in list_cbr_images() for UTF-16BE encoded filenames
- Add directory_mtimes table to skip unchanged dirs on incremental scans
- Add analyzer.rs: generate_thumbnail, analyze_library_books, regenerate_thumbnails
- Remove run_checkup() from API; indexer handles thumbnail jobs directly
- Remove api_base_url/api_bootstrap_token from IndexerConfig and AppState
- Add unar + poppler-utils to indexer Dockerfile
- Fix smoke.sh: wait for job completion, check thumbnail_url field

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 22:13:05 +01:00
parent 36af34443e
commit cfc896e92f
22 changed files with 1274 additions and 768 deletions
--- a/crates/core/src/config.rs
+++ b/crates/core/src/config.rs
@@ -32,10 +32,6 @@ pub struct IndexerConfig {
    pub meili_master_key: String,
    pub scan_interval_seconds: u64,
    pub thumbnail_config: ThumbnailConfig,
-    /// API base URL for thumbnail checkup at end of build (e.g. http://api:7080)
-    pub api_base_url: String,
-    /// Token to call API (e.g. API_BOOTSTRAP_TOKEN)
-    pub api_bootstrap_token: String,
 }

 #[derive(Debug, Clone)]
@@ -97,10 +93,6 @@ impl IndexerConfig {
                .and_then(|v| v.parse::<u64>().ok())
                .unwrap_or(5),
            thumbnail_config,
-            api_base_url: std::env::var("API_BASE_URL")
-                .unwrap_or_else(|_| "http://api:7080".to_string()),
-            api_bootstrap_token: std::env::var("API_BOOTSTRAP_TOKEN")
-                .context("API_BOOTSTRAP_TOKEN is required for thumbnail checkup")?,
        })
    }
 }
--- a/crates/parsers/src/lib.rs
+++ b/crates/parsers/src/lib.rs
@@ -2,6 +2,7 @@ use anyhow::{Context, Result};
 use std::io::Read;
 use std::path::Path;
 use std::process::Command;
+use std::sync::OnceLock;
 use uuid::Uuid;
 use walkdir::WalkDir;

@@ -40,38 +41,52 @@ pub fn detect_format(path: &Path) -> Option<BookFormat> {
    }
 }

-pub fn parse_metadata(
-    path: &Path,
-    format: BookFormat,
-    library_root: &Path,
-) -> Result<ParsedMetadata> {
-    let filename = path
-        .file_stem()
-        .map(|s| s.to_string_lossy().to_string())
-        .unwrap_or_else(|| "Untitled".to_string());
+// Cache compiled regex patterns — compiled once on first use
+static VOLUME_PATTERNS: OnceLock<Vec<(regex::Regex, usize)>> = OnceLock::new();

-    // Extract volume from filename (patterns: T01, T02, Vol 1, Volume 1, #1, - 01, etc.)
-    let volume = extract_volume(&filename);
+fn get_volume_patterns() -> &'static Vec<(regex::Regex, usize)> {
+    VOLUME_PATTERNS.get_or_init(|| {
+        [
+            // T01, T02 pattern (most common for manga/comics)
+            (r"(?i)T(\d+)", 1usize),
+            // Vol 1, Vol. 1, Volume 1
+            (r"(?i)Vol\.?\s*(\d+)", 1),
+            (r"(?i)Volume\s*(\d+)", 1),
+            // #1, #01
+            (r"#(\d+)", 1),
+            // - 1, - 01 at the end
+            (r"-\s*(\d+)\s*$", 1),
+        ]
+        .iter()
+        .filter_map(|(pattern, group)| {
+            regex::Regex::new(pattern).ok().map(|re| (re, *group))
+        })
+        .collect()
+    })
+}

-    // Keep original filename as title (don't clean it)
-    let title = filename;
+fn extract_volume(filename: &str) -> Option<i32> {
+    for (re, group) in get_volume_patterns() {
+        if let Some(caps) = re.captures(filename) {
+            if let Some(mat) = caps.get(*group) {
+                return mat.as_str().parse::<i32>().ok();
+            }
+        }
+    }
+    None
+}

-    // Determine series from parent folder relative to library root
-    let series = path.parent().and_then(|parent| {
-        // Normalize paths for comparison (handle different separators, etc.)
+fn extract_series(path: &Path, library_root: &Path) -> Option<String> {
+    path.parent().and_then(|parent| {
        let parent_str = parent.to_string_lossy().to_string();
        let root_str = library_root.to_string_lossy().to_string();

-        // Try to find the library root in the parent path
        let relative = if let Some(idx) = parent_str.find(&root_str) {
-            // Found root in parent, extract what comes after
            let after_root = &parent_str[idx + root_str.len()..];
            Path::new(after_root)
-        } else if let Some(relative) = parent.strip_prefix(library_root).ok() {
-            // Standard approach works
+        } else if let Ok(relative) = parent.strip_prefix(library_root) {
            relative
        } else {
-            // Log for diagnostic on server
            eprintln!(
                "[PARSER] Cannot determine series: parent '{}' doesn't start with root '{}'",
                parent.display(),
@@ -80,16 +95,14 @@ pub fn parse_metadata(
            return None;
        };

-        // Remove leading separators
        let relative_str = relative.to_string_lossy().to_string();
-        let relative_clean = relative_str.trim_start_matches(|c| c == '/' || c == '\\');
+        let relative_clean = relative_str.trim_start_matches(['/', '\\']);

        if relative_clean.is_empty() {
            return None;
        }

-        // Get first component as series
-        let first_sep = relative_clean.find(|c| c == '/' || c == '\\');
+        let first_sep = relative_clean.find(['/', '\\']);
        let series_name = match first_sep {
            Some(idx) => &relative_clean[..idx],
            None => relative_clean,
@@ -100,80 +113,178 @@ pub fn parse_metadata(
        } else {
            Some(series_name.to_string())
        }
-    });
+    })
+}

-    let page_count = match format {
+/// Fast metadata extraction from filename only — no archive I/O. Always succeeds.
+pub fn parse_metadata_fast(path: &Path, _format: BookFormat, library_root: &Path) -> ParsedMetadata {
+    let filename = path
+        .file_stem()
+        .map(|s| s.to_string_lossy().to_string())
+        .unwrap_or_else(|| "Untitled".to_string());
+
+    let volume = extract_volume(&filename);
+    let title = filename;
+    let series = extract_series(path, library_root);
+
+    ParsedMetadata {
+        title,
+        series,
+        volume,
+        page_count: None,
+    }
+}
+
+pub fn parse_metadata(
+    path: &Path,
+    format: BookFormat,
+    library_root: &Path,
+) -> Result<ParsedMetadata> {
+    let mut meta = parse_metadata_fast(path, format, library_root);
+
+    meta.page_count = match format {
        BookFormat::Cbz => parse_cbz_page_count(path).ok(),
        BookFormat::Cbr => parse_cbr_page_count(path).ok(),
        BookFormat::Pdf => parse_pdf_page_count(path).ok(),
    };

-    Ok(ParsedMetadata {
-        title,
-        series,
-        volume,
-        page_count,
-    })
+    Ok(meta)
 }

-fn extract_volume(filename: &str) -> Option<i32> {
-    // Common volume patterns: T01, T02, T1, T2, Vol 1, Vol. 1, Volume 1, #1, #01, - 1, - 01
-    let patterns = [
-        // T01, T02 pattern (most common for manga/comics)
-        (r"(?i)T(\d+)", 1),
-        // Vol 1, Vol. 1, Volume 1
-        (r"(?i)Vol\.?\s*(\d+)", 1),
-        (r"(?i)Volume\s*(\d+)", 1),
-        // #1, #01
-        (r"#(\d+)", 1),
-        // - 1, - 01 at the end
-        (r"-\s*(\d+)\s*$", 1),
-    ];
+/// Open an archive once and return (page_count, first_page_bytes).
+/// This is more efficient than calling parse_metadata + extract_first_page separately.
+pub fn analyze_book(path: &Path, format: BookFormat) -> Result<(i32, Vec<u8>)> {
+    match format {
+        BookFormat::Cbz => analyze_cbz(path),
+        BookFormat::Cbr => analyze_cbr(path),
+        BookFormat::Pdf => analyze_pdf(path),
+    }
+}

-    for (pattern, group) in &patterns {
-        if let Ok(re) = regex::Regex::new(pattern) {
-            if let Some(caps) = re.captures(filename) {
-                if let Some(mat) = caps.get(*group) {
-                    // Parse as integer to remove leading zeros
-                    return mat.as_str().parse::<i32>().ok();
-                }
-            }
+fn analyze_cbz(path: &Path) -> Result<(i32, Vec<u8>)> {
+    let file = std::fs::File::open(path)
+        .with_context(|| format!("cannot open cbz: {}", path.display()))?;
+    let mut archive = zip::ZipArchive::new(file).context("invalid cbz archive")?;
+
+    let mut image_names: Vec<String> = Vec::new();
+    for i in 0..archive.len() {
+        let entry = archive.by_index(i).context("cannot read cbz entry")?;
+        let name = entry.name().to_ascii_lowercase();
+        if is_image_name(&name) {
+            image_names.push(entry.name().to_string());
+        }
+    }
+    image_names.sort();
+
+    let count = image_names.len() as i32;
+    let first_image = image_names.first().context("no images found in cbz")?;
+
+    let mut entry = archive
+        .by_name(first_image)
+        .context("cannot read first image")?;
+    let mut buf = Vec::new();
+    entry.read_to_end(&mut buf)?;
+
+    Ok((count, buf))
+}
+
+fn list_cbr_images(path: &Path) -> Result<Vec<String>> {
+    // Try unrar lb first (fast)
+    let output = std::process::Command::new("unrar")
+        .arg("lb")
+        .arg(path)
+        .output()
+        .with_context(|| format!("failed to execute unrar lb for {}", path.display()))?;
+
+    if output.status.success() {
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let images: Vec<String> = stdout
+            .lines()
+            .filter(|line| is_image_name(&line.to_ascii_lowercase()))
+            .map(|l| l.to_string())
+            .collect();
+        if !images.is_empty() {
+            return Ok(images);
        }
    }

-    None
+    // Fallback: lsar (from unar package) handles UTF-16BE encoded filenames
+    let lsar_output = std::process::Command::new("lsar")
+        .arg(path)
+        .output()
+        .with_context(|| format!("failed to execute lsar for {}", path.display()))?;
+
+    if !lsar_output.status.success() {
+        return Err(anyhow::anyhow!(
+            "both unrar lb and lsar failed for {}",
+            path.display()
+        ));
+    }
+
+    let stdout = String::from_utf8_lossy(&lsar_output.stdout);
+    // lsar output: first line is archive info, then one file per line (indented)
+    let images: Vec<String> = stdout
+        .lines()
+        .skip(1) // skip the archive header line
+        .map(|l| l.trim().to_string())
+        .filter(|line| is_image_name(&line.to_ascii_lowercase()))
+        .collect();
+
+    Ok(images)
 }

-#[allow(dead_code)]
-fn clean_title(filename: &str) -> String {
-    // Remove volume patterns from title to clean it up
-    let cleaned = regex::Regex::new(r"(?i)\s*T\d+\s*")
-        .ok()
-        .and_then(|re| Some(re.replace_all(filename, " ").to_string()))
-        .unwrap_or_else(|| filename.to_string());
+fn analyze_cbr(path: &Path) -> Result<(i32, Vec<u8>)> {
+    let mut image_names = list_cbr_images(path)?;
+    image_names.sort();

-    let cleaned = regex::Regex::new(r"(?i)\s*Vol\.?\s*\d+\s*")
-        .ok()
-        .and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
-        .unwrap_or_else(|| cleaned);
+    let count = image_names.len() as i32;
+    if count == 0 {
+        return Err(anyhow::anyhow!("no images found in cbr: {}", path.display()));
+    }

-    let cleaned = regex::Regex::new(r"(?i)\s*Volume\s*\d+\s*")
-        .ok()
-        .and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
-        .unwrap_or_else(|| cleaned);
+    let first_name = &image_names[0];

-    let cleaned = regex::Regex::new(r"#\d+")
-        .ok()
-        .and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
-        .unwrap_or_else(|| cleaned);
+    // Try unrar p to extract first image to stdout (faster — no temp dir)
+    let p_output = std::process::Command::new("unrar")
+        .args(["p", "-inul"])
+        .arg(path)
+        .arg(first_name)
+        .output();

-    let cleaned = regex::Regex::new(r"-\s*\d+\s*$")
-        .ok()
-        .and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
-        .unwrap_or_else(|| cleaned);
+    match p_output {
+        Ok(out) if out.status.success() && looks_like_image(&out.stdout) => Ok((count, out.stdout)),
+        _ => {
+            // Fallback: full extraction with unar (handles special chars, encoding issues)
+            let image_bytes = extract_cbr_first_page(path)?;
+            Ok((count, image_bytes))
+        }
+    }
+}

-    // Clean up extra spaces
-    cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
+/// Check image magic bytes to validate that bytes are a real image before decoding.
+fn looks_like_image(bytes: &[u8]) -> bool {
+    if bytes.len() < 12 {
+        return false;
+    }
+    // JPEG: FF D8 FF
+    if bytes.starts_with(&[0xFF, 0xD8, 0xFF]) {
+        return true;
+    }
+    // PNG: 89 50 4E 47 0D 0A 1A 0A
+    if bytes.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
+        return true;
+    }
+    // WebP: RIFF....WEBP
+    if &bytes[0..4] == b"RIFF" && &bytes[8..12] == b"WEBP" {
+        return true;
+    }
+    false
+}
+
+fn analyze_pdf(path: &Path) -> Result<(i32, Vec<u8>)> {
+    let count = parse_pdf_page_count(path)?;
+    let image_bytes = extract_pdf_first_page(path)?;
+    Ok((count, image_bytes))
 }

 fn parse_cbz_page_count(path: &Path) -> Result<i32> {
@@ -192,26 +303,11 @@ fn parse_cbz_page_count(path: &Path) -> Result<i32> {
 }

 fn parse_cbr_page_count(path: &Path) -> Result<i32> {
-    let output = std::process::Command::new("unrar")
-        .arg("lb")
-        .arg(path)
-        .output()
-        .with_context(|| format!("failed to execute unrar for {}", path.display()))?;
-
-    if !output.status.success() {
-        return Err(anyhow::anyhow!("unrar failed for {}", path.display()));
-    }
-
-    let stdout = String::from_utf8_lossy(&output.stdout);
-    let count = stdout
-        .lines()
-        .filter(|line| is_image_name(&line.to_ascii_lowercase()))
-        .count() as i32;
-    Ok(count)
+    let images = list_cbr_images(path)?;
+    Ok(images.len() as i32)
 }

 fn parse_pdf_page_count(path: &Path) -> Result<i32> {
-    // Use pdfinfo command line tool instead of lopdf for better performance
    let output = std::process::Command::new("pdfinfo")
        .arg(path)
        .output()
@@ -238,6 +334,10 @@ fn parse_pdf_page_count(path: &Path) -> Result<i32> {
 }

 fn is_image_name(name: &str) -> bool {
+    // Skip macOS metadata entries (__MACOSX/ prefix or AppleDouble ._* files)
+    if name.starts_with("__macosx/") || name.contains("/._") || name.starts_with("._") {
+        return false;
+    }
    name.ends_with(".jpg")
        || name.ends_with(".jpeg")
        || name.ends_with(".png")
@@ -282,7 +382,6 @@ fn extract_cbr_first_page(path: &Path) -> Result<Vec<u8>> {
    let tmp_dir = std::env::temp_dir().join(format!("stripstream-cbr-thumb-{}", Uuid::new_v4()));
    std::fs::create_dir_all(&tmp_dir).context("cannot create temp dir")?;

-    // Use env command like the API does
    let output = std::process::Command::new("env")
        .args(["LC_ALL=en_US.UTF-8", "LANG=en_US.UTF-8", "unar", "-o"])
        .arg(&tmp_dir)
@@ -298,7 +397,6 @@ fn extract_cbr_first_page(path: &Path) -> Result<Vec<u8>> {
        ));
    }

-    // Use WalkDir for recursive search (CBR can have subdirectories)
    let mut image_files: Vec<_> = WalkDir::new(&tmp_dir)
        .into_iter()
        .filter_map(|e| e.ok())
@@ -346,3 +444,33 @@ fn extract_pdf_first_page(path: &Path) -> Result<Vec<u8>> {
    let _ = std::fs::remove_dir_all(&tmp_dir);
    Ok(data)
 }
+
+#[allow(dead_code)]
+fn clean_title(filename: &str) -> String {
+    let cleaned = regex::Regex::new(r"(?i)\s*T\d+\s*")
+        .ok()
+        .map(|re| re.replace_all(filename, " ").to_string())
+        .unwrap_or_else(|| filename.to_string());
+
+    let cleaned = regex::Regex::new(r"(?i)\s*Vol\.?\s*\d+\s*")
+        .ok()
+        .map(|re| re.replace_all(&cleaned, " ").to_string())
+        .unwrap_or(cleaned);
+
+    let cleaned = regex::Regex::new(r"(?i)\s*Volume\s*\d+\s*")
+        .ok()
+        .map(|re| re.replace_all(&cleaned, " ").to_string())
+        .unwrap_or(cleaned);
+
+    let cleaned = regex::Regex::new(r"#\d+")
+        .ok()
+        .map(|re| re.replace_all(&cleaned, " ").to_string())
+        .unwrap_or(cleaned);
+
+    let cleaned = regex::Regex::new(r"-\s*\d+\s*$")
+        .ok()
+        .map(|re| re.replace_all(&cleaned, " ").to_string())
+        .unwrap_or(cleaned);
+
+    cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
+}