Compare commits

...

3 Commits

Author SHA1 Message Date
3bd2fb7c1f feat(jobs): introduce extracting_pages status and update job progress handling
- Added a new job status 'extracting_pages' to represent the first sub-phase of thumbnail generation.
- Updated the database schema to include a timestamp for when thumbnail generation starts.
- Enhanced job progress components to handle the new status, including UI updates for displaying progress and status labels.
- Refactored job-related logic to accommodate the two-phase process: extracting pages and generating thumbnails.
- Adjusted SQL queries and job detail responses to include the new fields and statuses.

This change improves the clarity of job processing states and enhances user feedback during the thumbnail generation process.
2026-03-11 17:50:48 +01:00
3b6cc2903d perf(api): remplacer unar/pdftoppm par unrar crate et pdfium-render
CBR: extract_cbr_page extrayait TOUT le CBR sur disque pour lire une
seule page. Réécrit avec le crate unrar : listing en mémoire + extraction
ciblée de la page demandée uniquement. Zéro subprocess, zéro temp dir.

PDF: render_pdf_page utilisait pdftoppm subprocess + temp dir. Réécrit
avec pdfium-render in-process. Zéro subprocess, zéro temp dir.

CBZ: sort naturel (natord) pour l'ordre des pages.

Dockerfile API: retire unar et poppler-utils, ajoute libpdfium.so.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-11 16:52:15 +01:00
6abaa96fba perf(parsers): remplacer tous les subprocesses par des libs in-process
CBR: remplace unrar/unar CLI par le crate `unrar` (bindings libunrar
vendorisé, zéro dépendance système). Supprime XADRegexException, les
forks de processus et les dossiers temporaires.

PDF: remplace pdfinfo + pdftoppm par pdfium-render. Le PDF est ouvert
une seule fois pour obtenir le nombre de pages ET rasteriser la première
page. lopdf reste pour parse_metadata (page count seul).

convert_cbr_to_cbz: réécrit sans subprocess ni dossier temporaire —
les images sont lues en mémoire via unrar puis packées directement en ZIP.

Dockerfile indexer: retire unrar-free, unar, poppler-utils. Télécharge
libpdfium.so depuis bblanchon/pdfium-binaries au build.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-11 16:46:43 +01:00
18 changed files with 698 additions and 520 deletions

155
Cargo.lock generated
View File

@@ -62,6 +62,8 @@ dependencies = [
"futures",
"image",
"lru",
"natord",
"pdfium-render",
"rand 0.8.5",
"reqwest",
"serde",
@@ -75,10 +77,10 @@ dependencies = [
"tower-http",
"tracing",
"tracing-subscriber",
"unrar",
"utoipa",
"utoipa-swagger-ui",
"uuid",
"walkdir",
"webp",
"zip 2.4.2",
]
@@ -369,6 +371,26 @@ dependencies = [
"crossbeam-utils",
]
[[package]]
name = "console_error_panic_hook"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc"
dependencies = [
"cfg-if",
"wasm-bindgen",
]
[[package]]
name = "console_log"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be8aed40e4edbf4d3b4431ab260b63fdc40f5780a4766824329ea0f1eefe3c0f"
dependencies = [
"log",
"web-sys",
]
[[package]]
name = "const-oid"
version = "0.9.6"
@@ -1224,6 +1246,15 @@ dependencies = [
"serde",
]
[[package]]
name = "itertools"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.17"
@@ -1291,6 +1322,16 @@ version = "0.2.182"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112"
[[package]]
name = "libloading"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60"
dependencies = [
"cfg-if",
"windows-link",
]
[[package]]
name = "libm"
version = "0.2.16"
@@ -1404,6 +1445,12 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
[[package]]
name = "maybe-owned"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4"
[[package]]
name = "md-5"
version = "0.10.6"
@@ -1632,11 +1679,12 @@ name = "parsers"
version = "0.1.0"
dependencies = [
"anyhow",
"image",
"lopdf",
"natord",
"pdfium-render",
"regex",
"uuid",
"walkdir",
"unrar",
"zip 2.4.2",
]
@@ -1651,6 +1699,32 @@ dependencies = [
"subtle",
]
[[package]]
name = "pdfium-render"
version = "0.8.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6553f6604a52b3203db7b4e9d51eb4dd193cf455af9e56d40cab6575b547b679"
dependencies = [
"bitflags 2.11.0",
"bytemuck",
"bytes",
"chrono",
"console_error_panic_hook",
"console_log",
"image",
"itertools",
"js-sys",
"libloading",
"log",
"maybe-owned",
"once_cell",
"utf16string",
"vecmath",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
]
[[package]]
name = "pem-rfc7468"
version = "0.7.0"
@@ -1678,6 +1752,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "piston-float"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad78bf43dcf80e8f950c92b84f938a0fc7590b7f6866fbcbeca781609c115590"
[[package]]
name = "pkcs1"
version = "0.7.5"
@@ -2940,6 +3020,29 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "unrar"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92ec61343a630d2b50d13216dea5125e157d3fc180a7d3f447d22fe146b648fc"
dependencies = [
"bitflags 2.11.0",
"regex",
"unrar_sys",
"widestring",
]
[[package]]
name = "unrar_sys"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b77675b883cfbe6bf41e6b7a5cd6008e0a83ba497de3d96e41a064bbeead765"
dependencies = [
"cc",
"libc",
"winapi",
]
[[package]]
name = "untrusted"
version = "0.9.0"
@@ -2958,6 +3061,15 @@ dependencies = [
"serde",
]
[[package]]
name = "utf16string"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b62a1e85e12d5d712bf47a85f426b73d303e2d00a90de5f3004df3596e9d216"
dependencies = [
"byteorder",
]
[[package]]
name = "utf8_iter"
version = "1.0.4"
@@ -3028,6 +3140,15 @@ version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "vecmath"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "956ae1e0d85bca567dee1dcf87fb1ca2e792792f66f87dced8381f99cd91156a"
dependencies = [
"piston-float",
]
[[package]]
name = "version_check"
version = "0.9.5"
@@ -3240,6 +3361,28 @@ dependencies = [
"wasite",
]
[[package]]
name = "widestring"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72069c3113ab32ab29e5584db3c6ec55d416895e60715417b5b883a357c3e471"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.11"
@@ -3249,6 +3392,12 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-core"
version = "0.62.2"

View File

@@ -33,6 +33,8 @@ tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
uuid = { version = "1.12", features = ["serde", "v4"] }
natord = "1.0"
pdfium-render = { version = "0.8", default-features = false, features = ["pdfium_latest", "image_latest", "thread_safe"] }
unrar = "0.5"
walkdir = "2.5"
webp = "0.3"
utoipa = "4.0"

View File

@@ -28,8 +28,10 @@ tower-http = { version = "0.6", features = ["cors"] }
tracing.workspace = true
tracing-subscriber.workspace = true
uuid.workspace = true
natord.workspace = true
pdfium-render.workspace = true
unrar.workspace = true
zip = { version = "2.2", default-features = false, features = ["deflate"] }
utoipa.workspace = true
utoipa-swagger-ui = { workspace = true, features = ["axum"] }
webp.workspace = true
walkdir = "2"

View File

@@ -22,12 +22,26 @@ RUN --mount=type=cache,target=/sccache \
cargo install sqlx-cli --no-default-features --features postgres --locked
FROM debian:bookworm-slim
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates wget unar poppler-utils locales postgresql-client \
ca-certificates wget locales postgresql-client \
&& rm -rf /var/lib/apt/lists/*
RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen
ENV LANG=en_US.UTF-8
ENV LC_ALL=en_US.UTF-8
# Download pdfium shared library (replaces pdftoppm subprocess)
RUN ARCH=$(dpkg --print-architecture) && \
case "$ARCH" in \
amd64) PDFIUM_ARCH="linux-x64" ;; \
arm64) PDFIUM_ARCH="linux-arm64" ;; \
*) echo "Unsupported arch: $ARCH" && exit 1 ;; \
esac && \
wget -q "https://github.com/bblanchon/pdfium-binaries/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -O /tmp/pdfium.tgz && \
tar -xzf /tmp/pdfium.tgz -C /tmp && \
cp /tmp/lib/libpdfium.so /usr/local/lib/ && \
rm -rf /tmp/pdfium.tgz /tmp/lib /tmp/include && \
ldconfig
COPY --from=builder /app/target/release/api /usr/local/bin/api
COPY --from=builder /usr/local/cargo/bin/sqlx /usr/local/bin/sqlx
COPY infra/migrations /app/migrations

View File

@@ -65,6 +65,8 @@ pub struct IndexJobDetailResponse {
pub finished_at: Option<DateTime<Utc>>,
#[schema(value_type = Option<String>)]
pub phase2_started_at: Option<DateTime<Utc>>,
#[schema(value_type = Option<String>)]
pub generating_thumbnails_started_at: Option<DateTime<Utc>>,
pub stats_json: Option<serde_json::Value>,
pub error_opt: Option<String>,
#[schema(value_type = String)]
@@ -324,6 +326,7 @@ fn map_row_detail(row: sqlx::postgres::PgRow) -> IndexJobDetailResponse {
started_at: row.get("started_at"),
finished_at: row.get("finished_at"),
phase2_started_at: row.try_get("phase2_started_at").ok().flatten(),
generating_thumbnails_started_at: row.try_get("generating_thumbnails_started_at").ok().flatten(),
stats_json: row.get("stats_json"),
error_opt: row.get("error_opt"),
created_at: row.get("created_at"),
@@ -350,7 +353,7 @@ pub async fn get_active_jobs(State(state): State<AppState>) -> Result<Json<Vec<I
let rows = sqlx::query(
"SELECT id, library_id, book_id, type, status, started_at, finished_at, stats_json, error_opt, created_at, progress_percent, processed_files, total_files
FROM index_jobs
WHERE status IN ('pending', 'running', 'generating_thumbnails')
WHERE status IN ('pending', 'running', 'extracting_pages', 'generating_thumbnails')
ORDER BY created_at ASC"
)
.fetch_all(&state.pool)
@@ -380,7 +383,7 @@ pub async fn get_job_details(
id: axum::extract::Path<Uuid>,
) -> Result<Json<IndexJobDetailResponse>, ApiError> {
let row = sqlx::query(
"SELECT id, library_id, book_id, type, status, started_at, finished_at, phase2_started_at,
"SELECT id, library_id, book_id, type, status, started_at, finished_at, phase2_started_at, generating_thumbnails_started_at,
stats_json, error_opt, created_at, current_file, progress_percent, total_files, processed_files
FROM index_jobs WHERE id = $1"
)

View File

@@ -18,7 +18,6 @@ use sha2::{Digest, Sha256};
use sqlx::Row;
use tracing::{debug, error, info, instrument, warn};
use uuid::Uuid;
use walkdir::WalkDir;
use crate::{error::ApiError, state::AppState};
@@ -389,7 +388,7 @@ fn extract_cbz_page(abs_path: &str, page_number: u32) -> Result<Vec<u8>, ApiErro
image_names.push(entry.name().to_string());
}
}
image_names.sort();
image_names.sort_by(|a, b| natord::compare(a, b));
debug!("Found {} images in CBZ {}", image_names.len(), abs_path);
let index = page_number as usize - 1;
@@ -413,107 +412,94 @@ fn extract_cbz_page(abs_path: &str, page_number: u32) -> Result<Vec<u8>, ApiErro
fn extract_cbr_page(abs_path: &str, page_number: u32) -> Result<Vec<u8>, ApiError> {
info!("Opening CBR archive: {}", abs_path);
let index = page_number as usize - 1;
let tmp_dir = std::env::temp_dir().join(format!("stripstream-cbr-{}", Uuid::new_v4()));
debug!("Creating temp dir for CBR extraction: {}", tmp_dir.display());
std::fs::create_dir_all(&tmp_dir).map_err(|e| {
error!("Cannot create temp dir: {}", e);
ApiError::internal(format!("temp dir error: {}", e))
})?;
// Pass 1: list all image names (in-process, no subprocess)
let mut image_names: Vec<String> = {
let archive = unrar::Archive::new(abs_path)
.open_for_listing()
.map_err(|e| ApiError::internal(format!("unrar listing failed: {}", e)))?;
let mut names = Vec::new();
for entry in archive {
let entry = entry.map_err(|e| ApiError::internal(format!("unrar entry error: {}", e)))?;
let name = entry.filename.to_string_lossy().to_string();
if is_image_name(&name.to_ascii_lowercase()) {
names.push(name);
}
}
names
};
// Extract directly - skip listing which fails on UTF-16 encoded filenames
let extract_output = std::process::Command::new("env")
.args(["LC_ALL=en_US.UTF-8", "LANG=en_US.UTF-8", "unar", "-o"])
.arg(&tmp_dir)
.arg(abs_path)
.output()
.map_err(|e| {
let _ = std::fs::remove_dir_all(&tmp_dir);
error!("unar extract failed: {}", e);
ApiError::internal(format!("unar extract failed: {e}"))
})?;
image_names.sort_by(|a, b| natord::compare(a, b));
if !extract_output.status.success() {
let _ = std::fs::remove_dir_all(&tmp_dir);
let stderr = String::from_utf8_lossy(&extract_output.stderr);
error!("unar extract failed {}: {}", abs_path, stderr);
return Err(ApiError::internal("unar extract failed"));
let target = image_names
.get(index)
.ok_or_else(|| {
error!("Page {} out of range (total: {})", page_number, image_names.len());
ApiError::not_found("page out of range")
})?
.clone();
// Pass 2: extract only the target page to memory
let mut archive = unrar::Archive::new(abs_path)
.open_for_processing()
.map_err(|e| ApiError::internal(format!("unrar processing failed: {}", e)))?;
while let Some(header) = archive
.read_header()
.map_err(|e| ApiError::internal(format!("unrar read header: {}", e)))?
{
let entry_name = header.entry().filename.to_string_lossy().to_string();
if entry_name == target {
let (data, _) = header
.read()
.map_err(|e| ApiError::internal(format!("unrar read: {}", e)))?;
info!("Extracted CBR page {} ({} bytes)", page_number, data.len());
return Ok(data);
}
archive = header
.skip()
.map_err(|e| ApiError::internal(format!("unrar skip: {}", e)))?;
}
// Find and read the requested image (recursive search for CBR files with subdirectories)
let mut image_files: Vec<_> = WalkDir::new(&tmp_dir)
.into_iter()
.filter_map(|e| e.ok())
.filter(|e| {
let name = e.file_name().to_string_lossy().to_lowercase();
is_image_name(&name)
})
.collect();
image_files.sort_by_key(|e| e.path().to_string_lossy().to_lowercase());
let selected = image_files.get(index).ok_or_else(|| {
let _ = std::fs::remove_dir_all(&tmp_dir);
error!("Page {} not found (total: {})", page_number, image_files.len());
ApiError::not_found("page out of range")
})?;
let data = std::fs::read(selected.path()).map_err(|e| {
let _ = std::fs::remove_dir_all(&tmp_dir);
error!("read failed: {}", e);
ApiError::internal(format!("read error: {}", e))
})?;
let _ = std::fs::remove_dir_all(&tmp_dir);
info!("Successfully extracted CBR page {} ({} bytes)", page_number, data.len());
Ok(data)
Err(ApiError::not_found("page not found in archive"))
}
fn render_pdf_page(abs_path: &str, page_number: u32, width: u32) -> Result<Vec<u8>, ApiError> {
let tmp_dir = std::env::temp_dir().join(format!("stripstream-pdf-{}", Uuid::new_v4()));
debug!("Creating temp dir for PDF rendering: {}", tmp_dir.display());
std::fs::create_dir_all(&tmp_dir).map_err(|e| {
error!("Cannot create temp dir {}: {}", tmp_dir.display(), e);
ApiError::internal(format!("cannot create temp dir: {e}"))
})?;
let output_prefix = tmp_dir.join("page");
use pdfium_render::prelude::*;
let mut cmd = std::process::Command::new("pdftoppm");
cmd.arg("-f")
.arg(page_number.to_string())
.arg("-singlefile")
.arg("-png");
if width > 0 {
cmd.arg("-scale-to-x").arg(width.to_string()).arg("-scale-to-y").arg("-1");
}
cmd.arg(abs_path).arg(&output_prefix);
debug!("Rendering PDF page {} of {} (width: {})", page_number, abs_path, width);
debug!("Running pdftoppm for page {} of {} (width: {})", page_number, abs_path, width);
let output = cmd
.output()
.map_err(|e| {
error!("pdftoppm command failed for {} page {}: {}", abs_path, page_number, e);
ApiError::internal(format!("pdf render failed: {e}"))
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
let _ = std::fs::remove_dir_all(&tmp_dir);
error!("pdftoppm failed for {} page {}: {}", abs_path, page_number, stderr);
return Err(ApiError::internal("pdf render command failed"));
}
let pdfium = Pdfium::new(
Pdfium::bind_to_system_library()
.map_err(|e| ApiError::internal(format!("pdfium not available: {:?}", e)))?,
);
let image_path = output_prefix.with_extension("png");
debug!("Reading rendered PDF page from: {}", image_path.display());
let bytes = std::fs::read(&image_path).map_err(|e| {
error!("Failed to read rendered PDF output {}: {}", image_path.display(), e);
ApiError::internal(format!("render output missing: {e}"))
})?;
let _ = std::fs::remove_dir_all(&tmp_dir);
debug!("Successfully rendered PDF page {} to {} bytes", page_number, bytes.len());
Ok(bytes)
let document = pdfium
.load_pdf_from_file(abs_path, None)
.map_err(|e| ApiError::internal(format!("pdf load failed: {:?}", e)))?;
let page_index = (page_number - 1) as u16;
let page = document
.pages()
.get(page_index)
.map_err(|_| ApiError::not_found("page out of range"))?;
let render_width = if width > 0 { width as i32 } else { 1200 };
let config = PdfRenderConfig::new().set_target_width(render_width);
let bitmap = page
.render_with_config(&config)
.map_err(|e| ApiError::internal(format!("pdf render failed: {:?}", e)))?;
let image = bitmap.as_image();
let mut buf = std::io::Cursor::new(Vec::new());
image
.write_to(&mut buf, image::ImageFormat::Png)
.map_err(|e| ApiError::internal(format!("png encode failed: {}", e)))?;
debug!("Rendered PDF page {} ({} bytes)", page_number, buf.get_ref().len());
Ok(buf.into_inner())
}
fn transcode_image(input: &[u8], out_format: &OutputFormat, quality: u8, width: u32, filter: image::imageops::FilterType) -> Result<Vec<u8>, ApiError> {

View File

@@ -87,8 +87,8 @@ export function JobProgress({ jobId, onComplete }: JobProgressProps) {
const percent = progress.progress_percent ?? 0;
const processed = progress.processed_files ?? 0;
const total = progress.total_files ?? 0;
const isThumbnailsPhase = progress.status === "generating_thumbnails";
const unitLabel = isThumbnailsPhase ? "thumbnails" : "files";
const isPhase2 = progress.status === "extracting_pages" || progress.status === "generating_thumbnails";
const unitLabel = progress.status === "extracting_pages" ? "pages" : progress.status === "generating_thumbnails" ? "thumbnails" : "files";
return (
<div className="p-4 bg-card rounded-lg border border-border">
@@ -112,7 +112,7 @@ export function JobProgress({ jobId, onComplete }: JobProgressProps) {
)}
</div>
{progress.stats_json && !isThumbnailsPhase && (
{progress.stats_json && !isPhase2 && (
<div className="flex flex-wrap gap-3 text-xs">
<Badge variant="primary">Scanned: {progress.stats_json.scanned_files}</Badge>
<Badge variant="success">Indexed: {progress.stats_json.indexed_files}</Badge>

View File

@@ -33,7 +33,7 @@ interface JobRowProps {
}
export function JobRow({ job, libraryName, highlighted, onCancel, formatDate, formatDuration }: JobRowProps) {
const isActive = job.status === "running" || job.status === "pending" || job.status === "generating_thumbnails";
const isActive = job.status === "running" || job.status === "pending" || job.status === "extracting_pages" || job.status === "generating_thumbnails";
const [showProgress, setShowProgress] = useState(highlighted || isActive);
const handleComplete = () => {
@@ -52,13 +52,14 @@ export function JobRow({ job, libraryName, highlighted, onCancel, formatDate, fo
const removed = job.stats_json?.removed_files ?? 0;
const errors = job.stats_json?.errors ?? 0;
const isPhase2 = job.status === "extracting_pages" || job.status === "generating_thumbnails";
const isThumbnailPhase = job.status === "generating_thumbnails";
const isThumbnailJob = job.type === "thumbnail_rebuild" || job.type === "thumbnail_regenerate";
const hasThumbnailPhase = isThumbnailPhase || isThumbnailJob;
const hasThumbnailPhase = isPhase2 || isThumbnailJob;
// Files column: index-phase stats only
// Files column: index-phase stats only (Phase 1 discovery)
const filesDisplay =
job.status === "running" && !isThumbnailPhase
job.status === "running" && !isPhase2
? job.total_files != null
? `${job.processed_files ?? 0}/${job.total_files}`
: scanned > 0
@@ -70,8 +71,8 @@ export function JobRow({ job, libraryName, highlighted, onCancel, formatDate, fo
? `${scanned} scanned`
: "—";
// Thumbnails column
const thumbInProgress = hasThumbnailPhase && (job.status === "running" || isThumbnailPhase);
// Thumbnails column (Phase 2: extracting_pages + generating_thumbnails)
const thumbInProgress = hasThumbnailPhase && (job.status === "running" || isPhase2);
const thumbDisplay =
thumbInProgress && job.total_files != null
? `${job.processed_files ?? 0}/${job.total_files}`
@@ -128,7 +129,7 @@ export function JobRow({ job, libraryName, highlighted, onCancel, formatDate, fo
{errors > 0 && <span className="text-error"> {errors}</span>}
</div>
)}
{job.status === "running" && !isThumbnailPhase && job.total_files != null && (
{job.status === "running" && !isPhase2 && job.total_files != null && (
<MiniProgressBar value={job.processed_files ?? 0} max={job.total_files} className="w-24" />
)}
</div>
@@ -155,7 +156,7 @@ export function JobRow({ job, libraryName, highlighted, onCancel, formatDate, fo
>
View
</Link>
{(job.status === "pending" || job.status === "running" || job.status === "generating_thumbnails") && (
{(job.status === "pending" || job.status === "running" || job.status === "extracting_pages" || job.status === "generating_thumbnails") && (
<Button
variant="danger"
size="sm"

View File

@@ -78,7 +78,7 @@ export function JobsIndicator() {
return () => document.removeEventListener("mousedown", handleClickOutside);
}, []);
const runningJobs = activeJobs.filter(j => j.status === "running" || j.status === "generating_thumbnails");
const runningJobs = activeJobs.filter(j => j.status === "running" || j.status === "extracting_pages" || j.status === "generating_thumbnails");
const pendingJobs = activeJobs.filter(j => j.status === "pending");
const totalCount = activeJobs.length;
@@ -222,7 +222,7 @@ export function JobsIndicator() {
>
<div className="flex items-start gap-3">
<div className="mt-0.5">
{(job.status === "running" || job.status === "generating_thumbnails") && <span className="animate-spin inline-block"></span>}
{(job.status === "running" || job.status === "extracting_pages" || job.status === "generating_thumbnails") && <span className="animate-spin inline-block"></span>}
{job.status === "pending" && <span></span>}
</div>
@@ -234,7 +234,7 @@ export function JobsIndicator() {
</Badge>
</div>
{(job.status === "running" || job.status === "generating_thumbnails") && job.progress_percent != null && (
{(job.status === "running" || job.status === "extracting_pages" || job.status === "generating_thumbnails") && job.progress_percent != null && (
<div className="flex items-center gap-2 mt-2">
<MiniProgressBar value={job.progress_percent} />
<span className="text-xs font-medium text-muted-foreground">{job.progress_percent}%</span>

View File

@@ -60,6 +60,7 @@ export function Badge({ children, variant = "default", className = "" }: BadgePr
// Status badge for jobs/tasks
const statusVariants: Record<string, BadgeVariant> = {
running: "in-progress",
extracting_pages: "in-progress",
generating_thumbnails: "in-progress",
success: "completed",
completed: "completed",
@@ -70,6 +71,7 @@ const statusVariants: Record<string, BadgeVariant> = {
};
const statusLabels: Record<string, string> = {
extracting_pages: "Extracting pages",
generating_thumbnails: "Thumbnails",
};

View File

@@ -20,6 +20,7 @@ interface JobDetails {
started_at: string | null;
finished_at: string | null;
phase2_started_at: string | null;
generating_thumbnails_started_at: string | null;
current_file: string | null;
progress_percent: number | null;
processed_files: number | null;
@@ -123,21 +124,27 @@ export default async function JobDetailPage({ params }: JobDetailPageProps) {
const isCompleted = job.status === "success";
const isFailed = job.status === "failed";
const isCancelled = job.status === "cancelled";
const isExtractingPages = job.status === "extracting_pages";
const isThumbnailPhase = job.status === "generating_thumbnails";
const isPhase2 = isExtractingPages || isThumbnailPhase;
const { isThumbnailOnly } = typeInfo;
// Which label to use for the progress card
const progressTitle = isThumbnailOnly
? "Thumbnails"
: isThumbnailPhase
? "Phase 2 — Thumbnails"
: "Phase 1 — Discovery";
: isExtractingPages
? "Phase 2 — Extracting pages"
: isThumbnailPhase
? "Phase 2 — Thumbnails"
: "Phase 1 — Discovery";
const progressDescription = isThumbnailOnly
? undefined
: isThumbnailPhase
? "Generating thumbnails for the analyzed books"
: "Scanning and indexing files in the library";
: isExtractingPages
? "Extracting first page from each archive (page count + raw image)"
: isThumbnailPhase
? "Generating thumbnails for the analyzed books"
: "Scanning and indexing files in the library";
// Speed metric: thumbnail count for thumbnail jobs, scanned files for index jobs
const speedCount = isThumbnailOnly
@@ -145,7 +152,7 @@ export default async function JobDetailPage({ params }: JobDetailPageProps) {
: (job.stats_json?.scanned_files ?? 0);
const showProgressCard =
(isCompleted || isFailed || job.status === "running" || isThumbnailPhase) &&
(isCompleted || isFailed || job.status === "running" || isPhase2) &&
(job.total_files != null || !!job.current_file);
return (
@@ -312,20 +319,44 @@ export default async function JobDetailPage({ params }: JobDetailPageProps) {
</div>
)}
{/* Phase 2 start — for index jobs that have two phases */}
{job.phase2_started_at && (
{/* Phase 2a — Extracting pages (index jobs with phase2) */}
{job.phase2_started_at && !isThumbnailOnly && (
<div className="flex items-start gap-4">
<div className={`w-3.5 h-3.5 rounded-full mt-0.5 shrink-0 z-10 ${
job.generating_thumbnails_started_at || job.finished_at ? "bg-primary" : "bg-primary animate-pulse"
}`} />
<div className="flex-1 min-w-0">
<span className="text-sm font-medium text-foreground">Phase 2a Extracting pages</span>
<p className="text-xs text-muted-foreground">{new Date(job.phase2_started_at).toLocaleString()}</p>
<p className="text-xs text-primary/80 font-medium mt-0.5">
Duration: {formatDuration(job.phase2_started_at, job.generating_thumbnails_started_at ?? job.finished_at ?? null)}
{!job.generating_thumbnails_started_at && !job.finished_at && isExtractingPages && (
<span className="text-muted-foreground font-normal ml-1">· in progress</span>
)}
</p>
</div>
</div>
)}
{/* Phase 2b — Generating thumbnails */}
{(job.generating_thumbnails_started_at || (job.phase2_started_at && isThumbnailOnly)) && (
<div className="flex items-start gap-4">
<div className={`w-3.5 h-3.5 rounded-full mt-0.5 shrink-0 z-10 ${
job.finished_at ? "bg-primary" : "bg-primary animate-pulse"
}`} />
<div className="flex-1 min-w-0">
<span className="text-sm font-medium text-foreground">
{isThumbnailOnly ? "Thumbnails" : "Phase 2 — Thumbnails"}
{isThumbnailOnly ? "Thumbnails" : "Phase 2bGenerating thumbnails"}
</span>
<p className="text-xs text-muted-foreground">{new Date(job.phase2_started_at).toLocaleString()}</p>
{job.finished_at && (
<p className="text-xs text-muted-foreground">
{(job.generating_thumbnails_started_at ? new Date(job.generating_thumbnails_started_at) : job.phase2_started_at ? new Date(job.phase2_started_at) : null)?.toLocaleString()}
</p>
{(job.generating_thumbnails_started_at || job.finished_at) && (
<p className="text-xs text-primary/80 font-medium mt-0.5">
Duration: {formatDuration(job.phase2_started_at, job.finished_at)}
Duration: {formatDuration(
job.generating_thumbnails_started_at ?? job.phase2_started_at!,
job.finished_at ?? null
)}
{job.total_files != null && job.total_files > 0 && (
<span className="text-muted-foreground font-normal ml-1">
· {job.processed_files ?? job.total_files} thumbnails
@@ -333,6 +364,9 @@ export default async function JobDetailPage({ params }: JobDetailPageProps) {
)}
</p>
)}
{!job.finished_at && isThumbnailPhase && (
<span className="text-xs text-muted-foreground">in progress</span>
)}
</div>
</div>
)}
@@ -393,7 +427,7 @@ export default async function JobDetailPage({ params }: JobDetailPageProps) {
<div className="grid grid-cols-3 gap-4">
<StatBox
value={job.processed_files ?? 0}
label={isThumbnailOnly || isThumbnailPhase ? "Generated" : "Processed"}
label={isThumbnailOnly || isPhase2 ? "Generated" : "Processed"}
variant="primary"
/>
<StatBox value={job.total_files} label="Total" />

File diff suppressed because one or more lines are too long

View File

@@ -21,11 +21,24 @@ RUN --mount=type=cache,target=/sccache \
cargo build --release -p indexer
FROM debian:bookworm-slim
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates wget \
unrar-free unar \
poppler-utils \
&& rm -rf /var/lib/apt/lists/*
# Download pdfium shared library (replaces pdftoppm + pdfinfo subprocesses)
RUN ARCH=$(dpkg --print-architecture) && \
case "$ARCH" in \
amd64) PDFIUM_ARCH="linux-x64" ;; \
arm64) PDFIUM_ARCH="linux-arm64" ;; \
*) echo "Unsupported arch: $ARCH" && exit 1 ;; \
esac && \
wget -q "https://github.com/bblanchon/pdfium-binaries/releases/latest/download/pdfium-${PDFIUM_ARCH}.tgz" -O /tmp/pdfium.tgz && \
tar -xzf /tmp/pdfium.tgz -C /tmp && \
cp /tmp/lib/libpdfium.so /usr/local/lib/ && \
rm -rf /tmp/pdfium.tgz /tmp/lib /tmp/include && \
ldconfig
COPY --from=builder /app/target/release/indexer /usr/local/bin/indexer
EXPOSE 7081
CMD ["/usr/local/bin/indexer"]

View File

@@ -103,17 +103,32 @@ fn generate_thumbnail(image_bytes: &[u8], config: &ThumbnailConfig) -> anyhow::R
Ok(webp_data.to_vec())
}
fn save_thumbnail(
/// Save raw image bytes (as extracted from the archive) without any processing.
fn save_raw_image(book_id: Uuid, raw_bytes: &[u8], directory: &str) -> anyhow::Result<String> {
let dir = Path::new(directory);
std::fs::create_dir_all(dir)?;
let path = dir.join(format!("{}.raw", book_id));
std::fs::write(&path, raw_bytes)?;
Ok(path.to_string_lossy().to_string())
}
/// Resize the raw image and save it as a WebP thumbnail, overwriting the raw file.
fn resize_raw_to_webp(
book_id: Uuid,
thumbnail_bytes: &[u8],
raw_path: &str,
config: &ThumbnailConfig,
) -> anyhow::Result<String> {
let dir = Path::new(&config.directory);
std::fs::create_dir_all(dir)?;
let filename = format!("{}.webp", book_id);
let path = dir.join(&filename);
std::fs::write(&path, thumbnail_bytes)?;
Ok(path.to_string_lossy().to_string())
let raw_bytes = std::fs::read(raw_path)
.map_err(|e| anyhow::anyhow!("failed to read raw image {}: {}", raw_path, e))?;
let webp_bytes = generate_thumbnail(&raw_bytes, config)?;
let webp_path = Path::new(&config.directory).join(format!("{}.webp", book_id));
std::fs::write(&webp_path, &webp_bytes)?;
// Delete the raw file now that the WebP is written
let _ = std::fs::remove_file(raw_path);
Ok(webp_path.to_string_lossy().to_string())
}
fn book_format_from_str(s: &str) -> Option<BookFormat> {
@@ -125,7 +140,14 @@ fn book_format_from_str(s: &str) -> Option<BookFormat> {
}
}
/// Phase 2 — Analysis: open each unanalyzed archive once, extract page_count + thumbnail.
/// Phase 2 — Two-sub-phase analysis:
///
/// **Sub-phase A (extracting_pages)**: open each archive once, extract (page_count, raw_image_bytes),
/// save the raw bytes to `{directory}/{book_id}.raw`. I/O bound — runs at `concurrent_renders`.
///
/// **Sub-phase B (generating_thumbnails)**: load each `.raw` file, resize and encode as WebP,
/// overwrite as `{directory}/{book_id}.webp`. CPU bound — runs at `concurrent_renders`.
///
/// `thumbnail_only` = true: only process books missing thumbnail (page_count may already be set).
/// `thumbnail_only` = false: process books missing page_count.
pub async fn analyze_library_books(
@@ -143,7 +165,6 @@ pub async fn analyze_library_books(
let concurrency = load_thumbnail_concurrency(&state.pool).await;
// Query books that need analysis
let query_filter = if thumbnail_only {
"b.thumbnail_path IS NULL"
} else {
@@ -177,19 +198,7 @@ pub async fn analyze_library_books(
total, thumbnail_only, concurrency
);
// Update job status
let _ = sqlx::query(
"UPDATE index_jobs SET status = 'generating_thumbnails', total_files = $2, processed_files = 0, current_file = NULL WHERE id = $1",
)
.bind(job_id)
.bind(total)
.execute(&state.pool)
.await;
let processed_count = Arc::new(AtomicI32::new(0));
let cancelled_flag = Arc::new(AtomicBool::new(false));
// Background task: poll DB every 2s to detect cancellation
let cancel_pool = state.pool.clone();
let cancel_flag_for_poller = cancelled_flag.clone();
let cancel_handle = tokio::spawn(async move {
@@ -221,43 +230,56 @@ pub async fn analyze_library_books(
})
.collect();
stream::iter(tasks)
.for_each_concurrent(concurrency, |task| {
let processed_count = processed_count.clone();
// -------------------------------------------------------------------------
// Sub-phase A: extract first page from each archive and store raw image
// I/O bound — limited by HDD throughput, runs at `concurrency`
// -------------------------------------------------------------------------
let phase_a_start = std::time::Instant::now();
let _ = sqlx::query(
"UPDATE index_jobs SET status = 'extracting_pages', total_files = $2, processed_files = 0, current_file = NULL WHERE id = $1",
)
.bind(job_id)
.bind(total)
.execute(&state.pool)
.await;
let extracted_count = Arc::new(AtomicI32::new(0));
// Collected results: (book_id, raw_path, page_count)
let extracted: Vec<(Uuid, String, i32)> = stream::iter(tasks)
.map(|task| {
let pool = state.pool.clone();
let config = config.clone();
let cancelled = cancelled_flag.clone();
let extracted_count = extracted_count.clone();
async move {
if cancelled.load(Ordering::Relaxed) {
return;
return None;
}
let local_path = utils::remap_libraries_path(&task.abs_path);
let path = Path::new(&local_path);
let path = std::path::Path::new(&local_path);
let book_id = task.book_id;
let format = match book_format_from_str(&task.format) {
Some(f) => f,
None => {
warn!("[ANALYZER] Unknown format '{}' for book {}", task.format, task.book_id);
return;
warn!("[ANALYZER] Unknown format '{}' for book {}", task.format, book_id);
return None;
}
};
// Run blocking archive I/O on a thread pool
let book_id = task.book_id;
let path_owned = path.to_path_buf();
let pdf_scale = config.width.max(config.height);
let analyze_result = tokio::task::spawn_blocking(move || {
analyze_book(&path_owned, format, pdf_scale)
})
.await;
let path_owned = path.to_path_buf();
let analyze_result =
tokio::task::spawn_blocking(move || analyze_book(&path_owned, format, pdf_scale))
.await;
let (page_count, image_bytes) = match analyze_result {
let (page_count, raw_bytes) = match analyze_result {
Ok(Ok(result)) => result,
Ok(Err(e)) => {
warn!("[ANALYZER] analyze_book failed for book {}: {}", book_id, e);
// Mark parse_status = error in book_files
let _ = sqlx::query(
"UPDATE book_files SET parse_status = 'error', parse_error_opt = $2 WHERE book_id = $1",
)
@@ -265,66 +287,125 @@ pub async fn analyze_library_books(
.bind(e.to_string())
.execute(&pool)
.await;
return;
return None;
}
Err(e) => {
warn!("[ANALYZER] spawn_blocking error for book {}: {}", book_id, e);
return;
return None;
}
};
// Generate thumbnail
let thumb_result = tokio::task::spawn_blocking({
let config = config.clone();
move || generate_thumbnail(&image_bytes, &config)
// Save raw bytes to disk (no resize, no encode)
let raw_path = match tokio::task::spawn_blocking({
let dir = config.directory.clone();
let bytes = raw_bytes.clone();
move || save_raw_image(book_id, &bytes, &dir)
})
.await
{
Ok(Ok(p)) => p,
Ok(Err(e)) => {
warn!("[ANALYZER] save_raw_image failed for book {}: {}", book_id, e);
return None;
}
Err(e) => {
warn!("[ANALYZER] spawn_blocking save_raw error for book {}: {}", book_id, e);
return None;
}
};
// Update page_count in DB
if let Err(e) = sqlx::query("UPDATE books SET page_count = $1 WHERE id = $2")
.bind(page_count)
.bind(book_id)
.execute(&pool)
.await
{
warn!("[ANALYZER] DB page_count update failed for book {}: {}", book_id, e);
return None;
}
let processed = extracted_count.fetch_add(1, Ordering::Relaxed) + 1;
let percent = (processed as f64 / total as f64 * 50.0) as i32; // first 50%
let _ = sqlx::query(
"UPDATE index_jobs SET processed_files = $2, progress_percent = $3 WHERE id = $1",
)
.bind(job_id)
.bind(processed)
.bind(percent)
.execute(&pool)
.await;
Some((book_id, raw_path, page_count))
}
})
.buffer_unordered(concurrency)
.filter_map(|x| async move { x })
.collect()
.await;
if cancelled_flag.load(Ordering::Relaxed) {
cancel_handle.abort();
info!("[ANALYZER] Job {} cancelled during extraction phase", job_id);
return Err(anyhow::anyhow!("Job cancelled by user"));
}
let extracted_total = extracted.len() as i32;
let phase_a_elapsed = phase_a_start.elapsed();
info!(
"[ANALYZER] Sub-phase A complete: {}/{} books extracted in {:.1}s ({:.0} ms/book)",
extracted_total,
total,
phase_a_elapsed.as_secs_f64(),
if extracted_total > 0 { phase_a_elapsed.as_millis() as f64 / extracted_total as f64 } else { 0.0 }
);
// -------------------------------------------------------------------------
// Sub-phase B: resize raw images and encode as WebP
// CPU bound — can run at higher concurrency than I/O phase
// -------------------------------------------------------------------------
let phase_b_start = std::time::Instant::now();
let _ = sqlx::query(
"UPDATE index_jobs SET status = 'generating_thumbnails', generating_thumbnails_started_at = NOW(), total_files = $2, processed_files = 0, current_file = NULL WHERE id = $1",
)
.bind(job_id)
.bind(extracted_total)
.execute(&state.pool)
.await;
let resize_count = Arc::new(AtomicI32::new(0));
stream::iter(extracted)
.for_each_concurrent(concurrency, |(book_id, raw_path, page_count)| {
let pool = state.pool.clone();
let config = config.clone();
let cancelled = cancelled_flag.clone();
let resize_count = resize_count.clone();
async move {
if cancelled.load(Ordering::Relaxed) {
return;
}
let raw_path_clone = raw_path.clone();
let thumb_result = tokio::task::spawn_blocking(move || {
resize_raw_to_webp(book_id, &raw_path_clone, &config)
})
.await;
let thumb_bytes = match thumb_result {
Ok(Ok(b)) => b,
Ok(Err(e)) => {
warn!("[ANALYZER] thumbnail generation failed for book {}: {}", book_id, e);
// Still update page_count even if thumbnail fails
let _ = sqlx::query(
"UPDATE books SET page_count = $1 WHERE id = $2",
)
.bind(page_count)
.bind(book_id)
.execute(&pool)
.await;
return;
}
Err(e) => {
warn!("[ANALYZER] spawn_blocking thumbnail error for book {}: {}", book_id, e);
return;
}
};
// Save thumbnail file
let save_result = {
let config = config.clone();
tokio::task::spawn_blocking(move || save_thumbnail(book_id, &thumb_bytes, &config))
.await
};
let thumb_path = match save_result {
let thumb_path = match thumb_result {
Ok(Ok(p)) => p,
Ok(Err(e)) => {
warn!("[ANALYZER] save_thumbnail failed for book {}: {}", book_id, e);
let _ = sqlx::query("UPDATE books SET page_count = $1 WHERE id = $2")
.bind(page_count)
.bind(book_id)
.execute(&pool)
.await;
warn!("[ANALYZER] resize_raw_to_webp failed for book {}: {}", book_id, e);
// page_count is already set; thumbnail stays NULL
return;
}
Err(e) => {
warn!("[ANALYZER] spawn_blocking save error for book {}: {}", book_id, e);
warn!("[ANALYZER] spawn_blocking resize error for book {}: {}", book_id, e);
return;
}
};
// Update DB
if let Err(e) = sqlx::query(
"UPDATE books SET page_count = $1, thumbnail_path = $2 WHERE id = $3",
)
@@ -334,12 +415,13 @@ pub async fn analyze_library_books(
.execute(&pool)
.await
{
warn!("[ANALYZER] DB update failed for book {}: {}", book_id, e);
warn!("[ANALYZER] DB thumbnail update failed for book {}: {}", book_id, e);
return;
}
let processed = processed_count.fetch_add(1, Ordering::Relaxed) + 1;
let percent = (processed as f64 / total as f64 * 100.0) as i32;
let processed = resize_count.fetch_add(1, Ordering::Relaxed) + 1;
let percent =
50 + (processed as f64 / extracted_total as f64 * 50.0) as i32; // last 50%
let _ = sqlx::query(
"UPDATE index_jobs SET processed_files = $2, progress_percent = $3 WHERE id = $1",
)
@@ -355,14 +437,24 @@ pub async fn analyze_library_books(
cancel_handle.abort();
if cancelled_flag.load(Ordering::Relaxed) {
info!("[ANALYZER] Job {} cancelled by user, stopping analysis", job_id);
info!("[ANALYZER] Job {} cancelled during resize phase", job_id);
return Err(anyhow::anyhow!("Job cancelled by user"));
}
let final_count = processed_count.load(Ordering::Relaxed);
let final_count = resize_count.load(Ordering::Relaxed);
let phase_b_elapsed = phase_b_start.elapsed();
info!(
"[ANALYZER] Analysis complete: {}/{} books processed",
final_count, total
"[ANALYZER] Sub-phase B complete: {}/{} thumbnails generated in {:.1}s ({:.0} ms/book)",
final_count,
extracted_total,
phase_b_elapsed.as_secs_f64(),
if final_count > 0 { phase_b_elapsed.as_millis() as f64 / final_count as f64 } else { 0.0 }
);
info!(
"[ANALYZER] Total: {:.1}s (extraction {:.1}s + resize {:.1}s)",
(phase_a_elapsed + phase_b_elapsed).as_secs_f64(),
phase_a_elapsed.as_secs_f64(),
phase_b_elapsed.as_secs_f64(),
);
Ok(())
@@ -376,7 +468,6 @@ pub async fn regenerate_thumbnails(
) -> Result<()> {
let config = load_thumbnail_config(&state.pool).await;
// Delete thumbnail files for all books in scope
let book_ids_to_clear: Vec<Uuid> = sqlx::query_scalar(
r#"SELECT id FROM books WHERE (library_id = $1 OR $1 IS NULL) AND thumbnail_path IS NOT NULL"#,
)
@@ -387,34 +478,26 @@ pub async fn regenerate_thumbnails(
let mut deleted_count = 0usize;
for book_id in &book_ids_to_clear {
let filename = format!("{}.webp", book_id);
let thumbnail_path = Path::new(&config.directory).join(&filename);
if thumbnail_path.exists() {
if let Err(e) = std::fs::remove_file(&thumbnail_path) {
warn!(
"[ANALYZER] Failed to delete thumbnail {}: {}",
thumbnail_path.display(),
e
);
// Delete WebP thumbnail
let webp_path = Path::new(&config.directory).join(format!("{}.webp", book_id));
if webp_path.exists() {
if let Err(e) = std::fs::remove_file(&webp_path) {
warn!("[ANALYZER] Failed to delete thumbnail {}: {}", webp_path.display(), e);
} else {
deleted_count += 1;
}
}
// Delete raw file if it exists (interrupted previous run)
let raw_path = Path::new(&config.directory).join(format!("{}.raw", book_id));
let _ = std::fs::remove_file(&raw_path);
}
info!(
"[ANALYZER] Deleted {} thumbnail files for regeneration",
deleted_count
);
info!("[ANALYZER] Deleted {} thumbnail files for regeneration", deleted_count);
// Clear thumbnail_path in DB
sqlx::query(
r#"UPDATE books SET thumbnail_path = NULL WHERE (library_id = $1 OR $1 IS NULL)"#,
)
.bind(library_id)
.execute(&state.pool)
.await?;
sqlx::query(r#"UPDATE books SET thumbnail_path = NULL WHERE (library_id = $1 OR $1 IS NULL)"#)
.bind(library_id)
.execute(&state.pool)
.await?;
// Re-analyze all books (now thumbnail_path IS NULL for all)
analyze_library_books(state, job_id, library_id, true).await
}
@@ -422,16 +505,13 @@ pub async fn regenerate_thumbnails(
pub async fn cleanup_orphaned_thumbnails(state: &AppState) -> Result<()> {
let config = load_thumbnail_config(&state.pool).await;
// Load ALL book IDs across all libraries — we need the complete set to avoid
// deleting thumbnails that belong to other libraries during a per-library rebuild.
let existing_book_ids: std::collections::HashSet<Uuid> = sqlx::query_scalar(
r#"SELECT id FROM books"#,
)
.fetch_all(&state.pool)
.await
.unwrap_or_default()
.into_iter()
.collect();
let existing_book_ids: std::collections::HashSet<Uuid> =
sqlx::query_scalar(r#"SELECT id FROM books"#)
.fetch_all(&state.pool)
.await
.unwrap_or_default()
.into_iter()
.collect();
let thumbnail_dir = Path::new(&config.directory);
if !thumbnail_dir.exists() {
@@ -441,21 +521,23 @@ pub async fn cleanup_orphaned_thumbnails(state: &AppState) -> Result<()> {
let mut deleted_count = 0usize;
if let Ok(entries) = std::fs::read_dir(thumbnail_dir) {
for entry in entries.flatten() {
if let Some(file_name) = entry.file_name().to_str() {
if file_name.ends_with(".webp") {
if let Some(book_id_str) = file_name.strip_suffix(".webp") {
if let Ok(book_id) = Uuid::parse_str(book_id_str) {
if !existing_book_ids.contains(&book_id) {
if let Err(e) = std::fs::remove_file(entry.path()) {
warn!(
"Failed to delete orphaned thumbnail {}: {}",
entry.path().display(),
e
);
} else {
deleted_count += 1;
}
}
let file_name = entry.file_name();
let file_name = file_name.to_string_lossy();
// Clean up both .webp and orphaned .raw files
let stem = if let Some(s) = file_name.strip_suffix(".webp") {
Some(s.to_string())
} else if let Some(s) = file_name.strip_suffix(".raw") {
Some(s.to_string())
} else {
None
};
if let Some(book_id_str) = stem {
if let Ok(book_id) = Uuid::parse_str(&book_id_str) {
if !existing_book_ids.contains(&book_id) {
if let Err(e) = std::fs::remove_file(entry.path()) {
warn!("Failed to delete orphaned file {}: {}", entry.path().display(), e);
} else {
deleted_count += 1;
}
}
}
@@ -463,9 +545,6 @@ pub async fn cleanup_orphaned_thumbnails(state: &AppState) -> Result<()> {
}
}
info!(
"[ANALYZER] Deleted {} orphaned thumbnail files",
deleted_count
);
info!("[ANALYZER] Deleted {} orphaned thumbnail files", deleted_count);
Ok(())
}

View File

@@ -6,9 +6,10 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
natord.workspace = true
image.workspace = true
lopdf = "0.35"
natord.workspace = true
pdfium-render.workspace = true
regex = "1"
uuid.workspace = true
walkdir.workspace = true
unrar.workspace = true
zip = { version = "2.2", default-features = false, features = ["deflate"] }

View File

@@ -1,10 +1,7 @@
use anyhow::{Context, Result};
use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::OnceLock;
use uuid::Uuid;
use walkdir::WalkDir;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BookFormat {
@@ -152,8 +149,7 @@ pub fn parse_metadata(
}
/// Open an archive once and return (page_count, first_page_bytes).
/// This is more efficient than calling parse_metadata + extract_first_page separately.
/// `pdf_render_scale`: max dimension (width or height) used by pdftoppm; 0 means use default (400).
/// `pdf_render_scale`: max dimension used for PDF rasterization; 0 means use default (400).
pub fn analyze_book(path: &Path, format: BookFormat, pdf_render_scale: u32) -> Result<(i32, Vec<u8>)> {
match format {
BookFormat::Cbz => analyze_cbz(path),
@@ -189,105 +185,98 @@ fn analyze_cbz(path: &Path) -> Result<(i32, Vec<u8>)> {
Ok((count, buf))
}
fn list_cbr_images(path: &Path) -> Result<Vec<String>> {
// Try unrar lb first (fast)
let output = std::process::Command::new("unrar")
.arg("lb")
.arg(path)
.output()
.with_context(|| format!("failed to execute unrar lb for {}", path.display()))?;
if output.status.success() {
let stdout = String::from_utf8_lossy(&output.stdout);
let mut images: Vec<String> = stdout
.lines()
.map(|l| l.trim().to_string())
.filter(|line| is_image_name(&line.to_ascii_lowercase()))
.collect();
if !images.is_empty() {
images.sort_by(|a, b| natord::compare(a, b));
return Ok(images);
}
}
// Fallback: lsar (from unar package) handles UTF-16BE encoded filenames
let lsar_output = std::process::Command::new("lsar")
.arg(path)
.output()
.with_context(|| format!("failed to execute lsar for {}", path.display()))?;
if !lsar_output.status.success() {
return Err(anyhow::anyhow!(
"both unrar lb and lsar failed for {}",
path.display()
));
}
let stdout = String::from_utf8_lossy(&lsar_output.stdout);
// lsar output: first line is archive info, then one file per line (indented)
let mut images: Vec<String> = stdout
.lines()
.skip(1) // skip the archive header line
.map(|l| l.trim().to_string())
.filter(|line| is_image_name(&line.to_ascii_lowercase()))
.collect();
images.sort_by(|a, b| natord::compare(a, b));
Ok(images)
}
fn analyze_cbr(path: &Path) -> Result<(i32, Vec<u8>)> {
let mut image_names = list_cbr_images(path)?;
image_names.sort();
// Pass 1: list all image names via unrar (in-process, no subprocess)
let mut image_names: Vec<String> = {
let archive = unrar::Archive::new(path)
.open_for_listing()
.map_err(|e| anyhow::anyhow!("unrar listing failed for {}: {}", path.display(), e))?;
let mut names = Vec::new();
for entry in archive {
let entry = entry.map_err(|e| anyhow::anyhow!("unrar entry error: {}", e))?;
let name = entry.filename.to_string_lossy().to_string();
if is_image_name(&name.to_ascii_lowercase()) {
names.push(name);
}
}
names
};
let count = image_names.len() as i32;
if count == 0 {
if image_names.is_empty() {
return Err(anyhow::anyhow!("no images found in cbr: {}", path.display()));
}
let first_name = &image_names[0];
image_names.sort_by(|a, b| natord::compare(a, b));
let count = image_names.len() as i32;
let first_name = image_names[0].clone();
// Try unrar p to extract first image to stdout (faster — no temp dir)
let p_output = std::process::Command::new("unrar")
.args(["p", "-inul"])
.arg(path)
.arg(first_name)
.output();
// Pass 2: extract first image to memory
let mut archive = unrar::Archive::new(path)
.open_for_processing()
.map_err(|e| anyhow::anyhow!("unrar open for processing failed for {}: {}", path.display(), e))?;
match p_output {
Ok(out) if out.status.success() && looks_like_image(&out.stdout) => Ok((count, out.stdout)),
_ => {
// Fallback: targeted extraction with unar (handles special chars, encoding issues)
let image_bytes = extract_cbr_first_page(path, first_name)?;
Ok((count, image_bytes))
while let Some(header) = archive
.read_header()
.map_err(|e| anyhow::anyhow!("unrar read header: {}", e))?
{
let entry_name = header.entry().filename.to_string_lossy().to_string();
if entry_name == first_name {
let (data, _) = header
.read()
.map_err(|e| anyhow::anyhow!("unrar read data: {}", e))?;
return Ok((count, data));
}
archive = header
.skip()
.map_err(|e| anyhow::anyhow!("unrar skip: {}", e))?;
}
}
/// Check image magic bytes to validate that bytes are a real image before decoding.
fn looks_like_image(bytes: &[u8]) -> bool {
if bytes.len() < 12 {
return false;
}
// JPEG: FF D8 FF
if bytes.starts_with(&[0xFF, 0xD8, 0xFF]) {
return true;
}
// PNG: 89 50 4E 47 0D 0A 1A 0A
if bytes.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
return true;
}
// WebP: RIFF....WEBP
if &bytes[0..4] == b"RIFF" && &bytes[8..12] == b"WEBP" {
return true;
}
false
Err(anyhow::anyhow!(
"could not find '{}' in {}",
first_name,
path.display()
))
}
fn analyze_pdf(path: &Path, pdf_render_scale: u32) -> Result<(i32, Vec<u8>)> {
let count = parse_pdf_page_count(path)?;
let image_bytes = extract_pdf_first_page(path, pdf_render_scale)?;
Ok((count, image_bytes))
use pdfium_render::prelude::*;
// Open PDF once — get page count and render first page in a single pass
let pdfium = Pdfium::new(
Pdfium::bind_to_system_library()
.map_err(|e| anyhow::anyhow!("pdfium library not available: {:?}", e))?,
);
let document = pdfium
.load_pdf_from_file(path, None)
.map_err(|e| anyhow::anyhow!("pdfium load failed for {}: {:?}", path.display(), e))?;
let count = document.pages().len() as i32;
if count == 0 {
return Err(anyhow::anyhow!("PDF has no pages: {}", path.display()));
}
let scale = if pdf_render_scale == 0 { 400 } else { pdf_render_scale } as i32;
let config = PdfRenderConfig::new()
.set_target_width(scale)
.set_maximum_height(scale);
let page = document
.pages()
.get(0)
.map_err(|e| anyhow::anyhow!("cannot get first page of {}: {:?}", path.display(), e))?;
let bitmap = page
.render_with_config(&config)
.map_err(|e| anyhow::anyhow!("pdfium render failed for {}: {:?}", path.display(), e))?;
let image = bitmap.as_image();
let mut buf = std::io::Cursor::new(Vec::new());
image
.write_to(&mut buf, image::ImageFormat::Png)
.context("failed to encode rendered PDF page as PNG")?;
Ok((count, buf.into_inner()))
}
fn parse_cbz_page_count(path: &Path) -> Result<i32> {
@@ -306,34 +295,23 @@ fn parse_cbz_page_count(path: &Path) -> Result<i32> {
}
fn parse_cbr_page_count(path: &Path) -> Result<i32> {
let images = list_cbr_images(path)?;
Ok(images.len() as i32)
let archive = unrar::Archive::new(path)
.open_for_listing()
.map_err(|e| anyhow::anyhow!("unrar listing failed for {}: {}", path.display(), e))?;
let count = archive
.filter(|r| {
r.as_ref()
.map(|e| is_image_name(&e.filename.to_string_lossy().to_ascii_lowercase()))
.unwrap_or(false)
})
.count() as i32;
Ok(count)
}
fn parse_pdf_page_count(path: &Path) -> Result<i32> {
let output = std::process::Command::new("pdfinfo")
.arg(path)
.output()
.with_context(|| format!("failed to execute pdfinfo for {}", path.display()))?;
if !output.status.success() {
return Err(anyhow::anyhow!("pdfinfo failed for {}", path.display()));
}
let stdout = String::from_utf8_lossy(&output.stdout);
for line in stdout.lines() {
if line.starts_with("Pages:") {
if let Some(pages_str) = line.split_whitespace().nth(1) {
return pages_str
.parse::<i32>()
.with_context(|| format!("cannot parse page count: {}", pages_str));
}
}
}
Err(anyhow::anyhow!(
"could not find page count in pdfinfo output"
))
let doc = lopdf::Document::load(path)
.with_context(|| format!("cannot open pdf: {}", path.display()))?;
Ok(doc.get_pages().len() as i32)
}
fn is_image_name(name: &str) -> bool {
@@ -351,13 +329,8 @@ fn is_image_name(name: &str) -> bool {
pub fn extract_first_page(path: &Path, format: BookFormat) -> Result<Vec<u8>> {
match format {
BookFormat::Cbz => extract_cbz_first_page(path),
BookFormat::Cbr => {
let mut image_names = list_cbr_images(path)?;
image_names.sort();
let first_name = image_names.into_iter().next().context("no images found in cbr")?;
extract_cbr_first_page(path, &first_name)
}
BookFormat::Pdf => extract_pdf_first_page(path, 0),
BookFormat::Cbr => analyze_cbr(path).map(|(_, bytes)| bytes),
BookFormat::Pdf => analyze_pdf(path, 0).map(|(_, bytes)| bytes),
}
}
@@ -386,98 +359,13 @@ fn extract_cbz_first_page(path: &Path) -> Result<Vec<u8>> {
Ok(buf)
}
/// Extract the first image of a CBR archive by extracting the whole archive
/// with the `unar` subprocess into a temp dir, then reading the natural-sort
/// first image file found in the extracted tree.
///
/// `_first_name` is intentionally unused: since `unar` extracts everything,
/// the first page is re-discovered by walking the output directory.
///
/// # Errors
/// Fails if the temp dir cannot be created, the archive cannot be copied,
/// `unar` exits non-zero, or no image files are found after extraction.
fn extract_cbr_first_page(path: &Path, _first_name: &str) -> Result<Vec<u8>> {
    let work_dir = std::env::temp_dir().join(format!("stripstream-cbr-thumb-{}", Uuid::new_v4()));
    let extract_dir = work_dir.join("out");
    std::fs::create_dir_all(&extract_dir).context("cannot create temp dir")?;
    // unar constructs internal regexes from (archive_path + "/" + internal_path).
    // Archive filenames containing regex special chars like `[`, `]`, `(`, `)` cause
    // XADRegexException. Work around by giving unar a safe symlink name.
    let safe_path = work_dir.join("archive.cbr");
    if std::os::unix::fs::symlink(path, &safe_path).is_err() {
        // Cross-filesystem fallback: copy (slower but safe)
        std::fs::copy(path, &safe_path).context("cannot copy cbr to temp dir")?;
    }
    // Force a UTF-8 locale via `env` so unar decodes entry filenames consistently.
    let output = std::process::Command::new("env")
        .args(["LC_ALL=en_US.UTF-8", "LANG=en_US.UTF-8", "unar", "-o"])
        .arg(&extract_dir)
        .arg(&safe_path)
        .output()
        .context("unar failed")?;
    if !output.status.success() {
        // Clean up before surfacing the error; stderr carries unar's reason.
        let _ = std::fs::remove_dir_all(&work_dir);
        return Err(anyhow::anyhow!(
            "unar extract failed: {:?}",
            String::from_utf8_lossy(&output.stderr)
        ));
    }
    // Collect every extracted image (filtered by extension via is_image_name),
    // natural-sorted by full path so "page2" orders before "page10".
    let mut image_files: Vec<_> = WalkDir::new(&extract_dir)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| {
            let name = e.file_name().to_string_lossy().to_lowercase();
            is_image_name(&name)
        })
        .collect();
    image_files.sort_by(|a, b| natord::compare(&a.path().to_string_lossy(), &b.path().to_string_lossy()));
    let first_image = image_files.first().context("no images found in cbr")?;
    let data = std::fs::read(first_image.path())?;
    // Best-effort cleanup: the page bytes are already in memory.
    let _ = std::fs::remove_dir_all(&work_dir);
    Ok(data)
}
/// Rasterize page 1 of a PDF to PNG bytes via the `pdftoppm` subprocess.
///
/// `pdf_render_scale` is passed to pdftoppm's `-scale-to` (maximum output
/// dimension); a value of 0 falls back to the default of 400.
///
/// # Errors
/// Fails if the temp dir cannot be created, `pdftoppm` exits non-zero, or
/// the expected output PNG cannot be read.
fn extract_pdf_first_page(path: &Path, pdf_render_scale: u32) -> Result<Vec<u8>> {
    let tmp_dir = std::env::temp_dir().join(format!("stripstream-pdf-thumb-{}", Uuid::new_v4()));
    std::fs::create_dir_all(&tmp_dir)?;
    let output_prefix = tmp_dir.join("page");
    let scale = if pdf_render_scale == 0 { 400 } else { pdf_render_scale };
    let scale_str = scale.to_string();
    // NOTE(review): to_str().unwrap() panics on non-UTF-8 paths — confirm
    // upstream guarantees UTF-8, or switch to OsStr-based args.
    let output = Command::new("pdftoppm")
        .args([
            "-f",
            "1",
            "-singlefile",
            "-png",
            "-scale-to",
            &scale_str,
            path.to_str().unwrap(),
            output_prefix.to_str().unwrap(),
        ])
        .output()
        .context("pdftoppm failed")?;
    if !output.status.success() {
        let _ = std::fs::remove_dir_all(&tmp_dir);
        return Err(anyhow::anyhow!("pdftoppm failed"));
    }
    // -singlefile makes pdftoppm write exactly one file named "<prefix>.png".
    let image_path = output_prefix.with_extension("png");
    let data = std::fs::read(&image_path)?;
    let _ = std::fs::remove_dir_all(&tmp_dir);
    Ok(data)
}
/// Convert a CBR file to CBZ in-place (same directory, same stem).
///
/// The conversion is safe: a `.cbz.tmp` file is written first, verified, then
/// atomically renamed to `.cbz`. The original CBR is **not** deleted by this
/// function — the caller is responsible for removing it after a successful DB
/// update.
/// function — the caller is responsible for removing it after a successful DB update.
///
/// Returns the path of the newly created `.cbz` file.
///
/// # Errors
/// - Returns an error if a `.cbz` file with the same stem already exists.
/// - Returns an error if extraction, packing, or verification fails.
/// - Returns an error if `cbr_path` has no parent directory or no file stem.
pub fn convert_cbr_to_cbz(cbr_path: &Path) -> Result<PathBuf> {
let parent = cbr_path
.parent()
@@ -489,7 +377,6 @@ pub fn convert_cbr_to_cbz(cbr_path: &Path) -> Result<PathBuf> {
let cbz_path = parent.join(format!("{}.cbz", stem.to_string_lossy()));
let tmp_path = parent.join(format!("{}.cbz.tmp", stem.to_string_lossy()));
// Refuse if target CBZ already exists
if cbz_path.exists() {
return Err(anyhow::anyhow!(
"CBZ file already exists: {}",
@@ -497,46 +384,45 @@ pub fn convert_cbr_to_cbz(cbr_path: &Path) -> Result<PathBuf> {
));
}
// Extract CBR to a temp dir
let tmp_dir =
std::env::temp_dir().join(format!("stripstream-cbr-convert-{}", Uuid::new_v4()));
std::fs::create_dir_all(&tmp_dir).context("cannot create temp dir")?;
// Extract all images from CBR into memory using unrar crate (no subprocess)
let mut images: Vec<(String, Vec<u8>)> = Vec::new();
let mut archive = unrar::Archive::new(cbr_path)
.open_for_processing()
.map_err(|e| anyhow::anyhow!("unrar open failed for {}: {}", cbr_path.display(), e))?;
let output = std::process::Command::new("env")
.args(["LC_ALL=en_US.UTF-8", "LANG=en_US.UTF-8", "unar", "-o"])
.arg(&tmp_dir)
.arg(cbr_path)
.output()
.context("unar failed to start")?;
while let Some(header) = archive
.read_header()
.map_err(|e| anyhow::anyhow!("unrar read header: {}", e))?
{
let entry_name = header.entry().filename.to_string_lossy().to_string();
let file_name = Path::new(&entry_name)
.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_else(|| entry_name.clone());
if !output.status.success() {
let _ = std::fs::remove_dir_all(&tmp_dir);
return Err(anyhow::anyhow!(
"unar extraction failed: {}",
String::from_utf8_lossy(&output.stderr)
));
if is_image_name(&entry_name.to_ascii_lowercase()) {
let (data, next) = header
.read()
.map_err(|e| anyhow::anyhow!("unrar read: {}", e))?;
images.push((file_name, data));
archive = next;
} else {
archive = header
.skip()
.map_err(|e| anyhow::anyhow!("unrar skip: {}", e))?;
}
}
// Collect and sort image files
let mut image_files: Vec<_> = WalkDir::new(&tmp_dir)
.into_iter()
.filter_map(|e| e.ok())
.filter(|e| {
let name = e.file_name().to_string_lossy().to_lowercase();
is_image_name(&name)
})
.collect();
image_files.sort_by(|a, b| natord::compare(&a.path().to_string_lossy(), &b.path().to_string_lossy()));
let image_count = image_files.len();
if image_count == 0 {
let _ = std::fs::remove_dir_all(&tmp_dir);
if images.is_empty() {
return Err(anyhow::anyhow!(
"no images found in CBR: {}",
cbr_path.display()
));
}
images.sort_by(|(a, _), (b, _)| natord::compare(a, b));
let image_count = images.len();
// Pack images into the .cbz.tmp file
let pack_result = (|| -> Result<()> {
let cbz_file = std::fs::File::create(&tmp_path)
@@ -545,21 +431,16 @@ pub fn convert_cbr_to_cbz(cbr_path: &Path) -> Result<PathBuf> {
let options = zip::write::SimpleFileOptions::default()
.compression_method(zip::CompressionMethod::Deflated);
for entry in &image_files {
let file_name = entry.file_name().to_string_lossy().to_string();
zip.start_file(&file_name, options)
for (file_name, data) in &images {
zip.start_file(file_name, options)
.with_context(|| format!("cannot add file {} to zip", file_name))?;
let data = std::fs::read(entry.path())
.with_context(|| format!("cannot read {}", entry.path().display()))?;
zip.write_all(&data)
zip.write_all(data)
.with_context(|| format!("cannot write {} to zip", file_name))?;
}
zip.finish().context("cannot finalize zip")?;
Ok(())
})();
let _ = std::fs::remove_dir_all(&tmp_dir);
if let Err(err) = pack_result {
let _ = std::fs::remove_file(&tmp_path);
return Err(err);
@@ -593,7 +474,6 @@ pub fn convert_cbr_to_cbz(cbr_path: &Path) -> Result<PathBuf> {
return Err(err);
}
// Atomic rename .cbz.tmp → .cbz
std::fs::rename(&tmp_path, &cbz_path)
.with_context(|| format!("cannot rename {} to {}", tmp_path.display(), cbz_path.display()))?;

View File

@@ -0,0 +1,7 @@
-- Migration: Add status 'extracting_pages' for the first sub-phase of thumbnail generation
-- Phase 1 (extracting_pages): extract raw first-page image from archive, store as-is
-- Phase 2 (generating_thumbnails): resize and encode as WebP
-- DROP + ADD run as two actions of one ALTER TABLE statement, so the check
-- constraint is swapped in a single step; IF EXISTS makes the drop idempotent.
ALTER TABLE index_jobs
DROP CONSTRAINT IF EXISTS index_jobs_status_check,
ADD CONSTRAINT index_jobs_status_check
CHECK (status IN ('pending', 'running', 'extracting_pages', 'generating_thumbnails', 'success', 'failed', 'cancelled'));

View File

@@ -0,0 +1,5 @@
-- Add timestamp for Phase 2b (generating_thumbnails) so we can show separate durations:
-- Phase 2a: phase2_started_at → generating_thumbnails_started_at (extracting_pages)
-- Phase 2b: generating_thumbnails_started_at → finished_at
-- Column is nullable and guarded by IF NOT EXISTS, so the migration is safe
-- to re-run and existing rows are unaffected.
ALTER TABLE index_jobs
ADD COLUMN IF NOT EXISTS generating_thumbnails_started_at TIMESTAMPTZ;