From 3b6cc2903d2e4e7c3ea838eaf9583611499155d9 Mon Sep 17 00:00:00 2001 From: Froidefond Julien Date: Wed, 11 Mar 2026 16:52:15 +0100 Subject: [PATCH] perf(api): remplacer unar/pdftoppm par unrar crate et pdfium-render MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CBR: extract_cbr_page extrayait TOUT le CBR sur disque pour lire une seule page. Reécrit avec le crate unrar : listing en mémoire + extraction ciblée de la page demandée uniquement. Zéro subprocess, zéro temp dir. PDF: render_pdf_page utilisait pdftoppm subprocess + temp dir. Reécrit avec pdfium-render in-process. Zéro subprocess, zéro temp dir. CBZ: sort naturel (natord) pour l'ordre des pages. Dockerfile API: retire unar et poppler-utils, ajoute libpdfium.so. Co-Authored-By: Claude Sonnet 4.6 --- Cargo.lock | 4 +- apps/api/Cargo.toml | 4 +- apps/api/Dockerfile | 16 +++- apps/api/src/pages.rs | 170 +++++++++++++++++++----------------------- 4 files changed, 99 insertions(+), 95 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e438589..e2cee2d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -62,6 +62,8 @@ dependencies = [ "futures", "image", "lru", + "natord", + "pdfium-render", "rand 0.8.5", "reqwest", "serde", @@ -75,10 +77,10 @@ dependencies = [ "tower-http", "tracing", "tracing-subscriber", + "unrar", "utoipa", "utoipa-swagger-ui", "uuid", - "walkdir", "webp", "zip 2.4.2", ] diff --git a/apps/api/Cargo.toml b/apps/api/Cargo.toml index 8bb69b1..28f42e9 100644 --- a/apps/api/Cargo.toml +++ b/apps/api/Cargo.toml @@ -28,8 +28,10 @@ tower-http = { version = "0.6", features = ["cors"] } tracing.workspace = true tracing-subscriber.workspace = true uuid.workspace = true +natord.workspace = true +pdfium-render.workspace = true +unrar.workspace = true zip = { version = "2.2", default-features = false, features = ["deflate"] } utoipa.workspace = true utoipa-swagger-ui = { workspace = true, features = ["axum"] } webp.workspace = true -walkdir = "2" diff --git 
a/apps/api/Dockerfile b/apps/api/Dockerfile index 2495efa..dd58858 100644 --- a/apps/api/Dockerfile +++ b/apps/api/Dockerfile @@ -22,12 +22,26 @@ RUN --mount=type=cache,target=/sccache \ cargo install sqlx-cli --no-default-features --features postgres --locked FROM debian:bookworm-slim + RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates wget unar poppler-utils locales postgresql-client \ + ca-certificates wget locales postgresql-client \ && rm -rf /var/lib/apt/lists/* RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 + +# Download pdfium shared library (replaces pdftoppm subprocess) +RUN ARCH=$(dpkg --print-architecture) && \ + case "$ARCH" in \ + amd64) PDFIUM_ARCH="linux-x64" ;; \ + arm64) PDFIUM_ARCH="linux-arm64" ;; \ + *) echo "Unsupported arch: $ARCH" && exit 1 ;; \ + esac && \ + wget -q "https://github.com/bblanchon/pdfium-binaries/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -O /tmp/pdfium.tgz && \ + tar -xzf /tmp/pdfium.tgz -C /tmp && \ + cp /tmp/lib/libpdfium.so /usr/local/lib/ && \ + rm -rf /tmp/pdfium.tgz /tmp/lib /tmp/include && \ + ldconfig COPY --from=builder /app/target/release/api /usr/local/bin/api COPY --from=builder /usr/local/cargo/bin/sqlx /usr/local/bin/sqlx COPY infra/migrations /app/migrations diff --git a/apps/api/src/pages.rs b/apps/api/src/pages.rs index d048e66..59f07c6 100644 --- a/apps/api/src/pages.rs +++ b/apps/api/src/pages.rs @@ -18,7 +18,6 @@ use sha2::{Digest, Sha256}; use sqlx::Row; use tracing::{debug, error, info, instrument, warn}; use uuid::Uuid; -use walkdir::WalkDir; use crate::{error::ApiError, state::AppState}; @@ -389,7 +388,7 @@ fn extract_cbz_page(abs_path: &str, page_number: u32) -> Result, ApiErro image_names.push(entry.name().to_string()); } } - image_names.sort(); + image_names.sort_by(|a, b| natord::compare(a, b)); debug!("Found {} images in CBZ {}", image_names.len(), abs_path); let index = page_number as usize 
- 1; @@ -413,107 +412,95 @@ fn extract_cbz_page(abs_path: &str, page_number: u32) -> Result, ApiErro fn extract_cbr_page(abs_path: &str, page_number: u32) -> Result, ApiError> { info!("Opening CBR archive: {}", abs_path); - let index = page_number as usize - 1; - let tmp_dir = std::env::temp_dir().join(format!("stripstream-cbr-{}", Uuid::new_v4())); - debug!("Creating temp dir for CBR extraction: {}", tmp_dir.display()); - - std::fs::create_dir_all(&tmp_dir).map_err(|e| { - error!("Cannot create temp dir: {}", e); - ApiError::internal(format!("temp dir error: {}", e)) - })?; - // Extract directly - skip listing which fails on UTF-16 encoded filenames - let extract_output = std::process::Command::new("env") - .args(["LC_ALL=en_US.UTF-8", "LANG=en_US.UTF-8", "unar", "-o"]) - .arg(&tmp_dir) - .arg(abs_path) - .output() - .map_err(|e| { - let _ = std::fs::remove_dir_all(&tmp_dir); - error!("unar extract failed: {}", e); - ApiError::internal(format!("unar extract failed: {e}")) - })?; + // Pass 1: list all image names (in-process, no subprocess) + let mut image_names: Vec = { + let archive = unrar::Archive::new(abs_path) + .open_for_listing() + .map_err(|e| ApiError::internal(format!("unrar listing failed: {}", e)))?; + let mut names = Vec::new(); + for entry in archive { + let entry = entry.map_err(|e| ApiError::internal(format!("unrar entry error: {}", e)))?; + let name = entry.filename.to_string_lossy().to_string(); + if is_image_name(&name.to_ascii_lowercase()) { + names.push(name); + } + } + names + }; - if !extract_output.status.success() { - let _ = std::fs::remove_dir_all(&tmp_dir); - let stderr = String::from_utf8_lossy(&extract_output.stderr); - error!("unar extract failed {}: {}", abs_path, stderr); - return Err(ApiError::internal("unar extract failed")); + image_names.sort_by(|a, b| natord::compare(a, b)); + + let index = page_number as usize - 1; + let target = image_names + .get(index) + .ok_or_else(|| { + error!("Page {} out of range (total: {})", page_number, image_names.len()); + 
ApiError::not_found("page out of range") + })? + .clone(); + + // Pass 2: extract only the target page to memory + let mut archive = unrar::Archive::new(abs_path) + .open_for_processing() + .map_err(|e| ApiError::internal(format!("unrar processing failed: {}", e)))?; + + while let Some(header) = archive + .read_header() + .map_err(|e| ApiError::internal(format!("unrar read header: {}", e)))? + { + let entry_name = header.entry().filename.to_string_lossy().to_string(); + if entry_name == target { + let (data, _) = header + .read() + .map_err(|e| ApiError::internal(format!("unrar read: {}", e)))?; + info!("Extracted CBR page {} ({} bytes)", page_number, data.len()); + return Ok(data); + } + archive = header + .skip() + .map_err(|e| ApiError::internal(format!("unrar skip: {}", e)))?; } - // Find and read the requested image (recursive search for CBR files with subdirectories) - let mut image_files: Vec<_> = WalkDir::new(&tmp_dir) - .into_iter() - .filter_map(|e| e.ok()) - .filter(|e| { - let name = e.file_name().to_string_lossy().to_lowercase(); - is_image_name(&name) - }) - .collect(); - - image_files.sort_by_key(|e| e.path().to_string_lossy().to_lowercase()); - - let selected = image_files.get(index).ok_or_else(|| { - let _ = std::fs::remove_dir_all(&tmp_dir); - error!("Page {} not found (total: {})", page_number, image_files.len()); - ApiError::not_found("page out of range") - })?; - - let data = std::fs::read(selected.path()).map_err(|e| { - let _ = std::fs::remove_dir_all(&tmp_dir); - error!("read failed: {}", e); - ApiError::internal(format!("read error: {}", e)) - })?; - - let _ = std::fs::remove_dir_all(&tmp_dir); - - info!("Successfully extracted CBR page {} ({} bytes)", page_number, data.len()); - Ok(data) + Err(ApiError::not_found("page not found in archive")) } fn render_pdf_page(abs_path: &str, page_number: u32, width: u32) -> Result, ApiError> { - let tmp_dir = std::env::temp_dir().join(format!("stripstream-pdf-{}", Uuid::new_v4())); - debug!("Creating 
temp dir for PDF rendering: {}", tmp_dir.display()); - std::fs::create_dir_all(&tmp_dir).map_err(|e| { - error!("Cannot create temp dir {}: {}", tmp_dir.display(), e); - ApiError::internal(format!("cannot create temp dir: {e}")) - })?; - let output_prefix = tmp_dir.join("page"); + use pdfium_render::prelude::*; - let mut cmd = std::process::Command::new("pdftoppm"); - cmd.arg("-f") - .arg(page_number.to_string()) - .arg("-singlefile") - .arg("-png"); - if width > 0 { - cmd.arg("-scale-to-x").arg(width.to_string()).arg("-scale-to-y").arg("-1"); - } - cmd.arg(abs_path).arg(&output_prefix); + debug!("Rendering PDF page {} of {} (width: {})", page_number, abs_path, width); - debug!("Running pdftoppm for page {} of {} (width: {})", page_number, abs_path, width); - let output = cmd - .output() - .map_err(|e| { - error!("pdftoppm command failed for {} page {}: {}", abs_path, page_number, e); - ApiError::internal(format!("pdf render failed: {e}")) - })?; - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - let _ = std::fs::remove_dir_all(&tmp_dir); - error!("pdftoppm failed for {} page {}: {}", abs_path, page_number, stderr); - return Err(ApiError::internal("pdf render command failed")); - } + let pdfium = Pdfium::new( + Pdfium::bind_to_system_library() + .map_err(|e| ApiError::internal(format!("pdfium not available: {:?}", e)))?, + ); - let image_path = output_prefix.with_extension("png"); - debug!("Reading rendered PDF page from: {}", image_path.display()); - let bytes = std::fs::read(&image_path).map_err(|e| { - error!("Failed to read rendered PDF output {}: {}", image_path.display(), e); - ApiError::internal(format!("render output missing: {e}")) - })?; - let _ = std::fs::remove_dir_all(&tmp_dir); - debug!("Successfully rendered PDF page {} to {} bytes", page_number, bytes.len()); - Ok(bytes) + let document = pdfium + .load_pdf_from_file(abs_path, None) + .map_err(|e| ApiError::internal(format!("pdf load failed: {:?}", e)))?; + + 
let page_index = (page_number - 1) as u16; + let page = document + .pages() + .get(page_index) + .map_err(|_| ApiError::not_found("page out of range"))?; + + let render_width = if width > 0 { width as i32 } else { 1200 }; + let config = PdfRenderConfig::new().set_target_width(render_width); + + let bitmap = page + .render_with_config(&config) + .map_err(|e| ApiError::internal(format!("pdf render failed: {:?}", e)))?; + + let image = bitmap.as_image(); + let mut buf = std::io::Cursor::new(Vec::new()); + image + .write_to(&mut buf, image::ImageFormat::Png) + .map_err(|e| ApiError::internal(format!("png encode failed: {}", e)))?; + + debug!("Rendered PDF page {} ({} bytes)", page_number, buf.get_ref().len()); + Ok(buf.into_inner()) } fn transcode_image(input: &[u8], out_format: &OutputFormat, quality: u8, width: u32, filter: image::imageops::FilterType) -> Result, ApiError> {