fix(parsers): corriger la génération de thumbnails CBR/CBZ/PDF

- CBR: contourner le bug XADRegexException de unar en appelant unar avec un symlink à nom neutre (archive.cbr) au lieu du chemin réel, qui peut contenir des caractères regex spéciaux comme [ ] ( )
- CBR/CBZ: remplacer le tri lexicographique par natord (tri naturel) pour que page2.jpg soit trié avant page10.jpg
- PDF: brancher pdftoppm -scale-to sur config.width.max(config.height) au lieu d'une valeur hardcodée (800px → 400px par défaut)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-11 16:17:20 +01:00
parent 1c106a4ff2
commit f2d9bedcc7
5 changed files with 47 additions and 22 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1485,6 +1485,12 @@ dependencies = [
 "pxfm",
 ]
 [[package]]
 name = "natord"
 version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "308d96db8debc727c3fd9744aac51751243420e46edf401010908da7f8d5e57c"
 [[package]]
 name = "nom"
 version = "7.1.3"
@@ -1627,6 +1633,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "lopdf",
 "natord",
 "regex",
 "uuid",
 "walkdir",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,6 +32,7 @@ tower = { version = "0.5", features = ["limit"] }
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
 uuid = { version = "1.12", features = ["serde", "v4"] }
 natord = "1.0"
 walkdir = "2.5"
 webp = "0.3"
 utoipa = "4.0"
--- a/apps/indexer/src/analyzer.rs
+++ b/apps/indexer/src/analyzer.rs
@@ -247,8 +247,9 @@ pub async fn analyze_library_books(
                // Run blocking archive I/O on a thread pool
                let book_id = task.book_id;
                let path_owned = path.to_path_buf();
                let pdf_scale = config.width.max(config.height);
                let analyze_result = tokio::task::spawn_blocking(move || {
-                    analyze_book(&path_owned, format)
+                    analyze_book(&path_owned, format, pdf_scale)
                })
                .await;
--- a/crates/parsers/Cargo.toml
+++ b/crates/parsers/Cargo.toml
@@ -6,6 +6,7 @@ license.workspace = true
 [dependencies]
 anyhow.workspace = true
 natord.workspace = true
 lopdf = "0.35"
 regex = "1"
 uuid.workspace = true
--- a/crates/parsers/src/lib.rs
+++ b/crates/parsers/src/lib.rs
@@ -153,11 +153,12 @@ pub fn parse_metadata(
 /// Open an archive once and return (page_count, first_page_bytes).
 /// This is more efficient than calling parse_metadata + extract_first_page separately.
-pub fn analyze_book(path: &Path, format: BookFormat) -> Result<(i32, Vec<u8>)> {
+/// `pdf_render_scale`: max dimension (width or height) used by pdftoppm; 0 means use default (400).
 pub fn analyze_book(path: &Path, format: BookFormat, pdf_render_scale: u32) -> Result<(i32, Vec<u8>)> {
    match format {
        BookFormat::Cbz => analyze_cbz(path),
        BookFormat::Cbr => analyze_cbr(path),
-        BookFormat::Pdf => analyze_pdf(path),
+        BookFormat::Pdf => analyze_pdf(path, pdf_render_scale),
    }
 }
@@ -174,7 +175,7 @@ fn analyze_cbz(path: &Path) -> Result<(i32, Vec<u8>)> {
            image_names.push(entry.name().to_string());
        }
    }
-    image_names.sort();
+    image_names.sort_by(|a, b| natord::compare(a, b));
    let count = image_names.len() as i32;
    let first_image = image_names.first().context("no images found in cbz")?;
@@ -198,12 +199,13 @@ fn list_cbr_images(path: &Path) -> Result<Vec<String>> {
    if output.status.success() {
        let stdout = String::from_utf8_lossy(&output.stdout);
-        let images: Vec<String> = stdout
+        let mut images: Vec<String> = stdout
            .lines()
            .map(|l| l.trim().to_string())
            .filter(|line| is_image_name(&line.to_ascii_lowercase()))
            .collect();
        if !images.is_empty() {
            images.sort_by(|a, b| natord::compare(a, b));
            return Ok(images);
        }
    }
@@ -223,12 +225,13 @@ fn list_cbr_images(path: &Path) -> Result<Vec<String>> {
    let stdout = String::from_utf8_lossy(&lsar_output.stdout);
    // lsar output: first line is archive info, then one file per line (indented)
-    let images: Vec<String> = stdout
+    let mut images: Vec<String> = stdout
        .lines()
        .skip(1) // skip the archive header line
        .map(|l| l.trim().to_string())
        .filter(|line| is_image_name(&line.to_ascii_lowercase()))
        .collect();
    images.sort_by(|a, b| natord::compare(a, b));
    Ok(images)
 }
@@ -281,9 +284,9 @@ fn looks_like_image(bytes: &[u8]) -> bool {
    false
 }
-fn analyze_pdf(path: &Path) -> Result<(i32, Vec<u8>)> {
+fn analyze_pdf(path: &Path, pdf_render_scale: u32) -> Result<(i32, Vec<u8>)> {
    let count = parse_pdf_page_count(path)?;
-    let image_bytes = extract_pdf_first_page(path)?;
+    let image_bytes = extract_pdf_first_page(path, pdf_render_scale)?;
    Ok((count, image_bytes))
 }
@@ -354,7 +357,7 @@ pub fn extract_first_page(path: &Path, format: BookFormat) -> Result<Vec<u8>> {
            let first_name = image_names.into_iter().next().context("no images found in cbr")?;
            extract_cbr_first_page(path, &first_name)
        }
-        BookFormat::Pdf => extract_pdf_first_page(path),
+        BookFormat::Pdf => extract_pdf_first_page(path, 0),
    }
 }
@@ -371,7 +374,7 @@ fn extract_cbz_first_page(path: &Path) -> Result<Vec<u8>> {
            image_names.push(entry.name().to_string());
        }
    }
-    image_names.sort();
+    image_names.sort_by(|a, b| natord::compare(a, b));
    let first_image = image_names.first().context("no images found in cbz")?;
@@ -383,26 +386,36 @@ fn extract_cbz_first_page(path: &Path) -> Result<Vec<u8>> {
    Ok(buf)
 }
-fn extract_cbr_first_page(path: &Path, first_name: &str) -> Result<Vec<u8>> {
+fn extract_cbr_first_page(path: &Path, _first_name: &str) -> Result<Vec<u8>> {
-    let tmp_dir = std::env::temp_dir().join(format!("stripstream-cbr-thumb-{}", Uuid::new_v4()));
+    let work_dir = std::env::temp_dir().join(format!("stripstream-cbr-thumb-{}", Uuid::new_v4()));
-    std::fs::create_dir_all(&tmp_dir).context("cannot create temp dir")?;
+    let extract_dir = work_dir.join("out");
    std::fs::create_dir_all(&extract_dir).context("cannot create temp dir")?;
    // unar constructs internal regexes from (archive_path + "/" + internal_path).
    // Archive filenames containing regex special chars like `[`, `]`, `(`, `)` cause
    // XADRegexException. Work around by giving unar a safe symlink name.
    let safe_path = work_dir.join("archive.cbr");
    if std::os::unix::fs::symlink(path, &safe_path).is_err() {
        // Cross-filesystem fallback: copy (slower but safe)
        std::fs::copy(path, &safe_path).context("cannot copy cbr to temp dir")?;
    }
    let output = std::process::Command::new("env")
        .args(["LC_ALL=en_US.UTF-8", "LANG=en_US.UTF-8", "unar", "-o"])
-        .arg(&tmp_dir)
+        .arg(&extract_dir)
-        .arg(path)
+        .arg(&safe_path)
        .output()
        .context("unar failed")?;
    if !output.status.success() {
-        let _ = std::fs::remove_dir_all(&tmp_dir);
+        let _ = std::fs::remove_dir_all(&work_dir);
        return Err(anyhow::anyhow!(
            "unar extract failed: {:?}",
            String::from_utf8_lossy(&output.stderr)
        ));
    }
-    let mut image_files: Vec<_> = WalkDir::new(&tmp_dir)
+    let mut image_files: Vec<_> = WalkDir::new(&extract_dir)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| {
@@ -411,19 +424,21 @@ fn extract_cbr_first_page(path: &Path, first_name: &str) -> Result<Vec<u8>> {
        })
        .collect();
-    image_files.sort_by_key(|e| e.path().to_string_lossy().to_lowercase());
+    image_files.sort_by(|a, b| natord::compare(&a.path().to_string_lossy(), &b.path().to_string_lossy()));
    let first_image = image_files.first().context("no images found in cbr")?;
    let data = std::fs::read(first_image.path())?;
-    let _ = std::fs::remove_dir_all(&tmp_dir);
+    let _ = std::fs::remove_dir_all(&work_dir);
    Ok(data)
 }
-fn extract_pdf_first_page(path: &Path) -> Result<Vec<u8>> {
+fn extract_pdf_first_page(path: &Path, pdf_render_scale: u32) -> Result<Vec<u8>> {
    let tmp_dir = std::env::temp_dir().join(format!("stripstream-pdf-thumb-{}", Uuid::new_v4()));
    std::fs::create_dir_all(&tmp_dir)?;
    let output_prefix = tmp_dir.join("page");
    let scale = if pdf_render_scale == 0 { 400 } else { pdf_render_scale };
    let scale_str = scale.to_string();
    let output = Command::new("pdftoppm")
        .args([
@@ -432,7 +447,7 @@ fn extract_pdf_first_page(path: &Path) -> Result<Vec<u8>> {
            "-singlefile",
            "-png",
            "-scale-to",
-            "800",
+            &scale_str,
            path.to_str().unwrap(),
            output_prefix.to_str().unwrap(),
        ])
@@ -511,7 +526,7 @@ pub fn convert_cbr_to_cbz(cbr_path: &Path) -> Result<PathBuf> {
            is_image_name(&name)
        })
        .collect();
-    image_files.sort_by_key(|e| e.path().to_string_lossy().to_lowercase());
+    image_files.sort_by(|a, b| natord::compare(&a.path().to_string_lossy(), &b.path().to_string_lossy()));
    let image_count = image_files.len();
    if image_count == 0 {