feat: two-phase indexing with direct thumbnail generation in indexer

Phase 1 (discovery): walkdir + filename-only metadata, zero archive I/O.
Books are visible immediately in the UI while Phase 2 runs in background.

Phase 2 (analysis): open each archive once via analyze_book() to extract
page_count and first page bytes, then generate WebP thumbnail directly in
the indexer — removing the HTTP roundtrip to the API checkup endpoint.

- Add parse_metadata_fast() (infallible, no archive I/O)
- Add analyze_book() returning (page_count, first_page_bytes) in one pass
- Add looks_like_image() magic-bytes check to validate the stdout of `unrar p`
- Add lsar fallback in list_cbr_images() for UTF-16BE encoded filenames
- Add directory_mtimes table to skip unchanged dirs on incremental scans
- Add analyzer.rs: generate_thumbnail, analyze_library_books, regenerate_thumbnails
- Remove run_checkup() from API; indexer handles thumbnail jobs directly
- Remove api_base_url/api_bootstrap_token from IndexerConfig and AppState
- Add unar + poppler-utils to indexer Dockerfile
- Fix smoke.sh: wait for job completion, check thumbnail_url field

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-09 22:13:05 +01:00
parent 36af34443e
commit cfc896e92f
22 changed files with 1274 additions and 768 deletions

View File

@@ -0,0 +1,442 @@
use anyhow::Result;
use futures::stream::{self, StreamExt};
use image::GenericImageView;
use parsers::{analyze_book, BookFormat};
use sqlx::Row;
use std::path::Path;
use std::sync::atomic::{AtomicI32, Ordering};
use std::sync::Arc;
use tracing::{info, warn};
use uuid::Uuid;
use crate::{utils, AppState};
/// Thumbnail rendering settings used by the analysis phase.
///
/// `Debug` is derived so the effective configuration can be logged or inspected
/// while diagnosing thumbnail problems.
#[derive(Clone, Debug)]
struct ThumbnailConfig {
    /// When `false`, `analyze_library_books` returns without processing any book.
    enabled: bool,
    /// Width, in pixels, of the bounding box thumbnails are scaled to fit.
    width: u32,
    /// Height, in pixels, of the bounding box thumbnails are scaled to fit.
    height: u32,
    /// Requested WebP encoder quality.
    quality: u8,
    /// Directory in which `<book_id>.webp` files are stored.
    directory: String,
}
async fn load_thumbnail_config(pool: &sqlx::PgPool) -> ThumbnailConfig {
let fallback = ThumbnailConfig {
enabled: true,
width: 300,
height: 400,
quality: 80,
directory: "/data/thumbnails".to_string(),
};
let row = sqlx::query(r#"SELECT value FROM app_settings WHERE key = 'thumbnail'"#)
.fetch_optional(pool)
.await;
match row {
Ok(Some(row)) => {
let value: serde_json::Value = row.get("value");
ThumbnailConfig {
enabled: value
.get("enabled")
.and_then(|v| v.as_bool())
.unwrap_or(fallback.enabled),
width: value
.get("width")
.and_then(|v| v.as_u64())
.map(|v| v as u32)
.unwrap_or(fallback.width),
height: value
.get("height")
.and_then(|v| v.as_u64())
.map(|v| v as u32)
.unwrap_or(fallback.height),
quality: value
.get("quality")
.and_then(|v| v.as_u64())
.map(|v| v as u8)
.unwrap_or(fallback.quality),
directory: value
.get("directory")
.and_then(|v| v.as_str())
.map(|s| s.to_string())
.unwrap_or_else(|| fallback.directory.clone()),
}
}
_ => fallback,
}
}
async fn load_thumbnail_concurrency(pool: &sqlx::PgPool) -> usize {
let default_concurrency = 4;
let row = sqlx::query(r#"SELECT value FROM app_settings WHERE key = 'limits'"#)
.fetch_optional(pool)
.await;
match row {
Ok(Some(row)) => {
let value: serde_json::Value = row.get("value");
value
.get("concurrent_renders")
.and_then(|v| v.as_u64())
.map(|v| v as usize)
.unwrap_or(default_concurrency)
}
_ => default_concurrency,
}
}
/// Decodes `image_bytes`, scales the image to fit inside the configured
/// `width` x `height` box while keeping its aspect ratio, and encodes it as WebP.
///
/// # Errors
///
/// Returns an error when the bytes cannot be decoded as an image, when the decoded
/// image has a zero dimension, or when the configured box has a zero dimension.
fn generate_thumbnail(image_bytes: &[u8], config: &ThumbnailConfig) -> anyhow::Result<Vec<u8>> {
    let img = image::load_from_memory(image_bytes)
        .map_err(|e| anyhow::anyhow!("failed to load image: {}", e))?;

    let (orig_w, orig_h) = img.dimensions();
    if orig_w == 0 || orig_h == 0 {
        return Err(anyhow::anyhow!(
            "image has a zero dimension ({}x{})",
            orig_w,
            orig_h
        ));
    }
    if config.width == 0 || config.height == 0 {
        return Err(anyhow::anyhow!(
            "invalid thumbnail size {}x{}",
            config.width,
            config.height
        ));
    }

    // Use the smaller of the two scale factors so the result fits inside the box.
    let ratio_w = config.width as f32 / orig_w as f32;
    let ratio_h = config.height as f32 / orig_h as f32;
    let ratio = ratio_w.min(ratio_h);

    // Extreme aspect ratios can truncate one side to 0; keep at least one pixel.
    let new_w = ((orig_w as f32 * ratio) as u32).max(1);
    let new_h = ((orig_h as f32 * ratio) as u32).max(1);

    let resized = img.resize(new_w, new_h, image::imageops::FilterType::Lanczos3);

    // The encoder is fed 3-channel data, so convert straight to RGB8 instead of
    // building RGBA and stripping the alpha byte from every pixel by hand.
    let rgb = resized.to_rgb8();
    let (w, h) = rgb.dimensions();

    // NOTE(review): the configured quality is raised to at least 85, so lower
    // settings have no effect — confirm this floor is intended.
    // The upper clamp keeps the value inside WebP's 0-100 quality range even
    // though `quality` is a u8 and could hold up to 255.
    let quality = (config.quality as f32).clamp(85.0, 100.0);

    let webp_data = webp::Encoder::new(rgb.as_raw(), webp::PixelLayout::Rgb, w, h).encode(quality);
    Ok(webp_data.to_vec())
}
/// Writes `thumbnail_bytes` to `<config.directory>/<book_id>.webp`, creating the
/// directory if needed, and returns the written path as a string.
///
/// # Errors
///
/// Returns an error naming the affected path when the directory cannot be created
/// or the file cannot be written.
fn save_thumbnail(
    book_id: Uuid,
    thumbnail_bytes: &[u8],
    config: &ThumbnailConfig,
) -> anyhow::Result<String> {
    let dir = Path::new(&config.directory);
    std::fs::create_dir_all(dir).map_err(|e| {
        anyhow::anyhow!(
            "failed to create thumbnail directory {}: {}",
            dir.display(),
            e
        )
    })?;

    let path = dir.join(format!("{}.webp", book_id));
    // A bare io::Error does not say which file failed, so attach the path.
    std::fs::write(&path, thumbnail_bytes)
        .map_err(|e| anyhow::anyhow!("failed to write thumbnail {}: {}", path.display(), e))?;

    Ok(path.to_string_lossy().into_owned())
}
/// Translates a stored format name (`"cbz"`, `"cbr"` or `"pdf"`) into the
/// matching `BookFormat`; any other string yields `None`.
fn book_format_from_str(s: &str) -> Option<BookFormat> {
    let format = match s {
        "cbz" => BookFormat::Cbz,
        "cbr" => BookFormat::Cbr,
        "pdf" => BookFormat::Pdf,
        _ => return None,
    };
    Some(format)
}
/// Phase 2 — Analysis: open each unanalyzed archive once, extract page_count + thumbnail.
/// `thumbnail_only` = true: only process books missing thumbnail (page_count may already be set).
/// `thumbnail_only` = false: process books missing page_count.
pub async fn analyze_library_books(
state: &AppState,
job_id: Uuid,
library_id: Option<Uuid>,
thumbnail_only: bool,
) -> Result<()> {
let config = load_thumbnail_config(&state.pool).await;
if !config.enabled {
info!("[ANALYZER] Thumbnails disabled, skipping analysis phase");
return Ok(());
}
let concurrency = load_thumbnail_concurrency(&state.pool).await;
// Query books that need analysis
let query_filter = if thumbnail_only {
"b.thumbnail_path IS NULL"
} else {
"b.page_count IS NULL"
};
let sql = format!(
r#"
SELECT b.id AS book_id, bf.abs_path, bf.format
FROM books b
JOIN book_files bf ON bf.book_id = b.id
WHERE (b.library_id = $1 OR $1 IS NULL)
AND {}
"#,
query_filter
);
let rows = sqlx::query(&sql)
.bind(library_id)
.fetch_all(&state.pool)
.await?;
if rows.is_empty() {
info!("[ANALYZER] No books to analyze");
return Ok(());
}
let total = rows.len() as i32;
info!(
"[ANALYZER] Analyzing {} books (thumbnail_only={}, concurrency={})",
total, thumbnail_only, concurrency
);
// Update job status
let _ = sqlx::query(
"UPDATE index_jobs SET status = 'generating_thumbnails', total_files = $2, processed_files = 0, current_file = NULL WHERE id = $1",
)
.bind(job_id)
.bind(total)
.execute(&state.pool)
.await;
let processed_count = Arc::new(AtomicI32::new(0));
struct BookTask {
book_id: Uuid,
abs_path: String,
format: String,
}
let tasks: Vec<BookTask> = rows
.into_iter()
.map(|row| BookTask {
book_id: row.get("book_id"),
abs_path: row.get("abs_path"),
format: row.get("format"),
})
.collect();
stream::iter(tasks)
.for_each_concurrent(concurrency, |task| {
let processed_count = processed_count.clone();
let pool = state.pool.clone();
let config = config.clone();
async move {
let local_path = utils::remap_libraries_path(&task.abs_path);
let path = Path::new(&local_path);
let format = match book_format_from_str(&task.format) {
Some(f) => f,
None => {
warn!("[ANALYZER] Unknown format '{}' for book {}", task.format, task.book_id);
return;
}
};
// Run blocking archive I/O on a thread pool
let book_id = task.book_id;
let path_owned = path.to_path_buf();
let analyze_result = tokio::task::spawn_blocking(move || {
analyze_book(&path_owned, format)
})
.await;
let (page_count, image_bytes) = match analyze_result {
Ok(Ok(result)) => result,
Ok(Err(e)) => {
warn!("[ANALYZER] analyze_book failed for book {}: {}", book_id, e);
// Mark parse_status = error in book_files
let _ = sqlx::query(
"UPDATE book_files SET parse_status = 'error', parse_error_opt = $2 WHERE book_id = $1",
)
.bind(book_id)
.bind(e.to_string())
.execute(&pool)
.await;
return;
}
Err(e) => {
warn!("[ANALYZER] spawn_blocking error for book {}: {}", book_id, e);
return;
}
};
// Generate thumbnail
let thumb_result = tokio::task::spawn_blocking({
let config = config.clone();
move || generate_thumbnail(&image_bytes, &config)
})
.await;
let thumb_bytes = match thumb_result {
Ok(Ok(b)) => b,
Ok(Err(e)) => {
warn!("[ANALYZER] thumbnail generation failed for book {}: {}", book_id, e);
// Still update page_count even if thumbnail fails
let _ = sqlx::query(
"UPDATE books SET page_count = $1 WHERE id = $2",
)
.bind(page_count)
.bind(book_id)
.execute(&pool)
.await;
return;
}
Err(e) => {
warn!("[ANALYZER] spawn_blocking thumbnail error for book {}: {}", book_id, e);
return;
}
};
// Save thumbnail file
let save_result = {
let config = config.clone();
tokio::task::spawn_blocking(move || save_thumbnail(book_id, &thumb_bytes, &config))
.await
};
let thumb_path = match save_result {
Ok(Ok(p)) => p,
Ok(Err(e)) => {
warn!("[ANALYZER] save_thumbnail failed for book {}: {}", book_id, e);
let _ = sqlx::query("UPDATE books SET page_count = $1 WHERE id = $2")
.bind(page_count)
.bind(book_id)
.execute(&pool)
.await;
return;
}
Err(e) => {
warn!("[ANALYZER] spawn_blocking save error for book {}: {}", book_id, e);
return;
}
};
// Update DB
if let Err(e) = sqlx::query(
"UPDATE books SET page_count = $1, thumbnail_path = $2 WHERE id = $3",
)
.bind(page_count)
.bind(&thumb_path)
.bind(book_id)
.execute(&pool)
.await
{
warn!("[ANALYZER] DB update failed for book {}: {}", book_id, e);
return;
}
let processed = processed_count.fetch_add(1, Ordering::Relaxed) + 1;
let percent = (processed as f64 / total as f64 * 100.0) as i32;
let _ = sqlx::query(
"UPDATE index_jobs SET processed_files = $2, progress_percent = $3 WHERE id = $1",
)
.bind(job_id)
.bind(processed)
.bind(percent)
.execute(&pool)
.await;
}
})
.await;
let final_count = processed_count.load(Ordering::Relaxed);
info!(
"[ANALYZER] Analysis complete: {}/{} books processed",
final_count, total
);
Ok(())
}
/// Clear thumbnail files and DB references for books in scope, then re-analyze.
///
/// `library_id = None` targets every library. When thumbnails are disabled the
/// function does nothing: the re-analysis step would be skipped anyway, so
/// deleting first would only destroy the existing thumbnails.
///
/// # Errors
///
/// Returns an error if the list of affected books cannot be read, if clearing
/// `thumbnail_path` fails, or if the re-analysis itself fails. Failing to delete
/// an individual file is only logged.
pub async fn regenerate_thumbnails(
    state: &AppState,
    job_id: Uuid,
    library_id: Option<Uuid>,
) -> Result<()> {
    let config = load_thumbnail_config(&state.pool).await;

    if !config.enabled {
        info!("[ANALYZER] Thumbnails disabled, skipping thumbnail regeneration");
        return Ok(());
    }

    // Propagate a query failure instead of treating it as "no thumbnails":
    // otherwise the DB references below would be cleared without the files.
    let book_ids_to_clear: Vec<Uuid> = sqlx::query_scalar(
        r#"SELECT id FROM books WHERE (library_id = $1 OR $1 IS NULL) AND thumbnail_path IS NOT NULL"#,
    )
    .bind(library_id)
    .fetch_all(&state.pool)
    .await?;

    // Delete thumbnail files for all books in scope
    let thumbnail_dir = Path::new(&config.directory);
    let mut deleted_count = 0usize;
    for book_id in &book_ids_to_clear {
        let thumbnail_path = thumbnail_dir.join(format!("{}.webp", book_id));
        // Remove directly and tolerate "not found" rather than checking
        // `exists()` first, which leaves a window between check and delete.
        match std::fs::remove_file(&thumbnail_path) {
            Ok(()) => deleted_count += 1,
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
            Err(e) => {
                warn!(
                    "[ANALYZER] Failed to delete thumbnail {}: {}",
                    thumbnail_path.display(),
                    e
                );
            }
        }
    }
    info!(
        "[ANALYZER] Deleted {} thumbnail files for regeneration",
        deleted_count
    );

    // Clear thumbnail_path in DB
    sqlx::query(
        r#"UPDATE books SET thumbnail_path = NULL WHERE (library_id = $1 OR $1 IS NULL)"#,
    )
    .bind(library_id)
    .execute(&state.pool)
    .await?;

    // Re-analyze all books (now thumbnail_path IS NULL for all)
    analyze_library_books(state, job_id, library_id, true).await
}
/// Delete orphaned thumbnail files (books deleted in full_rebuild get new UUIDs).
///
/// A file `<uuid>.webp` in the thumbnail directory is orphaned when no row in
/// `books` has that id. Files whose name is not `<uuid>.webp` are left alone.
///
/// `library_id` is kept for interface compatibility but deliberately does not
/// narrow the set of known books. All libraries write into the same thumbnail
/// directory, so comparing against a single library's books would delete every
/// other library's thumbnails.
///
/// # Errors
///
/// Returns an error if the list of existing books cannot be read. Nothing is
/// deleted in that case, because an incomplete list would make valid thumbnails
/// look orphaned.
pub async fn cleanup_orphaned_thumbnails(
    state: &AppState,
    library_id: Option<Uuid>,
) -> Result<()> {
    // See the doc comment: the scope is intentionally ignored.
    let _ = library_id;

    let config = load_thumbnail_config(&state.pool).await;

    let thumbnail_dir = Path::new(&config.directory);
    if !thumbnail_dir.exists() {
        return Ok(());
    }

    let existing_book_ids: std::collections::HashSet<Uuid> =
        sqlx::query_scalar::<_, Uuid>(r#"SELECT id FROM books"#)
            .fetch_all(&state.pool)
            .await?
            .into_iter()
            .collect();

    let entries = match std::fs::read_dir(thumbnail_dir) {
        Ok(entries) => entries,
        Err(e) => {
            warn!(
                "[ANALYZER] Failed to read thumbnail directory {}: {}",
                thumbnail_dir.display(),
                e
            );
            return Ok(());
        }
    };

    let mut deleted_count = 0usize;
    for entry in entries.flatten() {
        let file_name = entry.file_name();

        // Only files named `<uuid>.webp` are candidates.
        let book_id = match file_name
            .to_str()
            .and_then(|name| name.strip_suffix(".webp"))
            .and_then(|stem| Uuid::parse_str(stem).ok())
        {
            Some(id) => id,
            None => continue,
        };

        if existing_book_ids.contains(&book_id) {
            continue;
        }

        let path = entry.path();
        match std::fs::remove_file(&path) {
            Ok(()) => deleted_count += 1,
            Err(e) => {
                warn!(
                    "Failed to delete orphaned thumbnail {}: {}",
                    path.display(),
                    e
                );
            }
        }
    }

    info!(
        "[ANALYZER] Deleted {} orphaned thumbnail files",
        deleted_count
    );
    Ok(())
}