feat: two-phase indexation with direct thumbnail generation in indexer

Phase 1 (discovery): walkdir + filename-only metadata, zero archive I/O.
Books are visible immediately in the UI while Phase 2 runs in background.

Phase 2 (analysis): open each archive once via analyze_book() to extract
page_count and first page bytes, then generate WebP thumbnail directly in
the indexer — removing the HTTP roundtrip to the API checkup endpoint.

- Add parse_metadata_fast() (infallible, no archive I/O)
- Add analyze_book() returning (page_count, first_page_bytes) in one pass
- Add looks_like_image() magic bytes check for unrar p stdout validation
- Add lsar fallback in list_cbr_images() for UTF-16BE encoded filenames
- Add directory_mtimes table to skip unchanged dirs on incremental scans
- Add analyzer.rs: generate_thumbnail, analyze_library_books, regenerate_thumbnails
- Remove run_checkup() from API; indexer handles thumbnail jobs directly
- Remove api_base_url/api_bootstrap_token from IndexerConfig and AppState
- Add unar + poppler-utils to indexer Dockerfile
- Fix smoke.sh: wait for job completion, check thumbnail_url field

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-09 22:13:05 +01:00
parent 36af34443e
commit cfc896e92f
22 changed files with 1274 additions and 768 deletions

View File

@@ -31,5 +31,5 @@ uuid.workspace = true
zip = { version = "2.2", default-features = false, features = ["deflate"] }
utoipa.workspace = true
utoipa-swagger-ui = { workspace = true, features = ["axum"] }
webp = "0.3"
webp.workspace = true
walkdir = "2"

View File

@@ -247,7 +247,7 @@ pub async fn list_folders(
}
let mut folders = Vec::new();
let depth = if params.get("path").is_some() {
let depth = if params.contains_key("path") {
canonical_target.strip_prefix(&canonical_base)
.map(|p| p.components().count())
.unwrap_or(0)

View File

@@ -76,7 +76,6 @@ async fn main() -> anyhow::Result<()> {
.route("/index/jobs/active", get(index_jobs::get_active_jobs))
.route("/index/jobs/:id", get(index_jobs::get_job_details))
.route("/index/jobs/:id/stream", get(index_jobs::stream_job_progress))
.route("/index/jobs/:id/thumbnails/checkup", axum::routing::post(thumbnails::start_checkup))
.route("/index/jobs/:id/errors", get(index_jobs::get_job_errors))
.route("/index/cancel/:id", axum::routing::post(index_jobs::cancel_job))
.route("/folders", get(index_jobs::list_folders))

View File

@@ -550,12 +550,12 @@ fn transcode_image(input: &[u8], out_format: &OutputFormat, quality: u8, width:
}
fn format_matches(source: &ImageFormat, target: &OutputFormat) -> bool {
match (source, target) {
(ImageFormat::Jpeg, OutputFormat::Jpeg) => true,
(ImageFormat::Png, OutputFormat::Png) => true,
(ImageFormat::WebP, OutputFormat::Webp) => true,
_ => false,
}
matches!(
(source, target),
(ImageFormat::Jpeg, OutputFormat::Jpeg)
| (ImageFormat::Png, OutputFormat::Png)
| (ImageFormat::WebP, OutputFormat::Webp)
)
}
fn is_image_name(name: &str) -> bool {

View File

@@ -1,310 +1,12 @@
use std::path::Path;
use std::sync::atomic::{AtomicI32, Ordering};
use std::sync::Arc;
use anyhow::Context;
use axum::{
extract::{Path as AxumPath, State},
http::StatusCode,
extract::State,
Json,
};
use futures::stream::{self, StreamExt};
use image::GenericImageView;
use serde::Deserialize;
use sqlx::Row;
use tracing::{info, warn};
use uuid::Uuid;
use utoipa::ToSchema;
use crate::{error::ApiError, index_jobs, pages, state::AppState};
#[derive(Clone)]
struct ThumbnailConfig {
enabled: bool,
width: u32,
height: u32,
quality: u8,
directory: String,
}
async fn load_thumbnail_concurrency(pool: &sqlx::PgPool) -> usize {
let default_concurrency = 4;
let row = sqlx::query(r#"SELECT value FROM app_settings WHERE key = 'limits'"#)
.fetch_optional(pool)
.await;
match row {
Ok(Some(row)) => {
let value: serde_json::Value = row.get("value");
value
.get("concurrent_renders")
.and_then(|v| v.as_u64())
.map(|v| v as usize)
.unwrap_or(default_concurrency)
}
_ => default_concurrency,
}
}
async fn load_thumbnail_config(pool: &sqlx::PgPool) -> ThumbnailConfig {
let fallback = ThumbnailConfig {
enabled: true,
width: 300,
height: 400,
quality: 80,
directory: "/data/thumbnails".to_string(),
};
let row = sqlx::query(r#"SELECT value FROM app_settings WHERE key = 'thumbnail'"#)
.fetch_optional(pool)
.await;
match row {
Ok(Some(row)) => {
let value: serde_json::Value = row.get("value");
ThumbnailConfig {
enabled: value
.get("enabled")
.and_then(|v| v.as_bool())
.unwrap_or(fallback.enabled),
width: value
.get("width")
.and_then(|v| v.as_u64())
.map(|v| v as u32)
.unwrap_or(fallback.width),
height: value
.get("height")
.and_then(|v| v.as_u64())
.map(|v| v as u32)
.unwrap_or(fallback.height),
quality: value
.get("quality")
.and_then(|v| v.as_u64())
.map(|v| v as u8)
.unwrap_or(fallback.quality),
directory: value
.get("directory")
.and_then(|v| v.as_str())
.map(|s| s.to_string())
.unwrap_or_else(|| fallback.directory.clone()),
}
}
_ => fallback,
}
}
fn generate_thumbnail(image_bytes: &[u8], config: &ThumbnailConfig) -> anyhow::Result<Vec<u8>> {
let img = image::load_from_memory(image_bytes).context("failed to load image")?;
let (orig_w, orig_h) = img.dimensions();
let ratio_w = config.width as f32 / orig_w as f32;
let ratio_h = config.height as f32 / orig_h as f32;
let ratio = ratio_w.min(ratio_h);
let new_w = (orig_w as f32 * ratio) as u32;
let new_h = (orig_h as f32 * ratio) as u32;
let resized = img.resize(new_w, new_h, image::imageops::FilterType::Lanczos3);
let rgba = resized.to_rgba8();
let (w, h) = rgba.dimensions();
let rgb_data: Vec<u8> = rgba.pixels().flat_map(|p| [p[0], p[1], p[2]]).collect();
let quality = f32::max(config.quality as f32, 85.0);
let webp_data =
webp::Encoder::new(&rgb_data, webp::PixelLayout::Rgb, w, h).encode(quality);
Ok(webp_data.to_vec())
}
fn save_thumbnail(book_id: Uuid, thumbnail_bytes: &[u8], config: &ThumbnailConfig) -> anyhow::Result<String> {
let dir = Path::new(&config.directory);
std::fs::create_dir_all(dir)?;
let filename = format!("{}.webp", book_id);
let path = dir.join(&filename);
std::fs::write(&path, thumbnail_bytes)?;
Ok(path.to_string_lossy().to_string())
}
async fn run_checkup(state: AppState, job_id: Uuid) {
let pool = &state.pool;
let row = sqlx::query("SELECT library_id, type FROM index_jobs WHERE id = $1")
.bind(job_id)
.fetch_optional(pool)
.await;
let (library_id, job_type) = match row {
Ok(Some(r)) => (
r.get::<Option<Uuid>, _>("library_id"),
r.get::<String, _>("type"),
),
_ => {
warn!("thumbnails checkup: job {} not found", job_id);
return;
}
};
// Regenerate or full_rebuild: clear existing thumbnails in scope so they get regenerated
if job_type == "thumbnail_regenerate" || job_type == "full_rebuild" {
let config = load_thumbnail_config(pool).await;
if job_type == "full_rebuild" {
// For full_rebuild: delete orphaned thumbnail files (books were deleted, new ones have new UUIDs)
// Get all existing book IDs to keep their thumbnails
let existing_book_ids: std::collections::HashSet<Uuid> = sqlx::query_scalar(
r#"SELECT id FROM books WHERE (library_id = $1 OR $1 IS NULL)"#,
)
.bind(library_id)
.fetch_all(pool)
.await
.unwrap_or_default()
.into_iter()
.collect();
// Delete thumbnail files that don't correspond to existing books
let thumbnail_dir = Path::new(&config.directory);
if thumbnail_dir.exists() {
let mut deleted_count = 0;
if let Ok(entries) = std::fs::read_dir(thumbnail_dir) {
for entry in entries.flatten() {
if let Some(file_name) = entry.file_name().to_str() {
if file_name.ends_with(".webp") {
if let Some(book_id_str) = file_name.strip_suffix(".webp") {
if let Ok(book_id) = Uuid::parse_str(book_id_str) {
if !existing_book_ids.contains(&book_id) {
if let Err(e) = std::fs::remove_file(entry.path()) {
warn!("Failed to delete orphaned thumbnail {}: {}", entry.path().display(), e);
} else {
deleted_count += 1;
}
}
}
}
}
}
}
}
info!("thumbnails full_rebuild: deleted {} orphaned thumbnail files", deleted_count);
}
} else {
// For regenerate: delete thumbnail files for books with thumbnails
let book_ids_to_clear: Vec<Uuid> = sqlx::query_scalar(
r#"SELECT id FROM books WHERE (library_id = $1 OR $1 IS NULL) AND thumbnail_path IS NOT NULL"#,
)
.bind(library_id)
.fetch_all(pool)
.await
.unwrap_or_default();
let mut deleted_count = 0;
for book_id in &book_ids_to_clear {
let filename = format!("{}.webp", book_id);
let thumbnail_path = Path::new(&config.directory).join(&filename);
if thumbnail_path.exists() {
if let Err(e) = std::fs::remove_file(&thumbnail_path) {
warn!("Failed to delete thumbnail file {}: {}", thumbnail_path.display(), e);
} else {
deleted_count += 1;
}
}
}
info!("thumbnails regenerate: deleted {} thumbnail files", deleted_count);
}
// Clear thumbnail_path in database
let cleared = sqlx::query(
r#"UPDATE books SET thumbnail_path = NULL WHERE (library_id = $1 OR $1 IS NULL)"#,
)
.bind(library_id)
.execute(pool)
.await;
if let Ok(res) = cleared {
info!("thumbnails {}: cleared {} books in database", job_type, res.rows_affected());
}
}
let book_ids: Vec<Uuid> = sqlx::query_scalar(
r#"SELECT id FROM books WHERE (library_id = $1 OR $1 IS NULL) AND thumbnail_path IS NULL"#,
)
.bind(library_id)
.fetch_all(pool)
.await
.unwrap_or_default();
let config = load_thumbnail_config(pool).await;
if !config.enabled || book_ids.is_empty() {
let _ = sqlx::query(
"UPDATE index_jobs SET status = 'success', finished_at = NOW(), progress_percent = 100, current_file = NULL WHERE id = $1",
)
.bind(job_id)
.execute(pool)
.await;
return;
}
let total = book_ids.len() as i32;
let _ = sqlx::query(
"UPDATE index_jobs SET status = 'generating_thumbnails', total_files = $2, processed_files = 0, current_file = NULL WHERE id = $1",
)
.bind(job_id)
.bind(total)
.execute(pool)
.await;
let concurrency = load_thumbnail_concurrency(pool).await;
let processed_count = Arc::new(AtomicI32::new(0));
let pool_clone = pool.clone();
let job_id_clone = job_id;
let config_clone = config.clone();
let state_clone = state.clone();
let total_clone = total;
stream::iter(book_ids)
.for_each_concurrent(concurrency, |book_id| {
let processed_count = processed_count.clone();
let pool = pool_clone.clone();
let job_id = job_id_clone;
let config = config_clone.clone();
let state = state_clone.clone();
let total = total_clone;
async move {
match pages::render_book_page_1(&state, book_id, config.width, config.quality).await {
Ok(page_bytes) => {
match generate_thumbnail(&page_bytes, &config) {
Ok(thumb_bytes) => {
if let Ok(path) = save_thumbnail(book_id, &thumb_bytes, &config) {
if sqlx::query("UPDATE books SET thumbnail_path = $1 WHERE id = $2")
.bind(&path)
.bind(book_id)
.execute(&pool)
.await
.is_ok()
{
let processed = processed_count.fetch_add(1, Ordering::Relaxed) + 1;
let percent = (processed as f64 / total as f64 * 100.0) as i32;
let _ = sqlx::query(
"UPDATE index_jobs SET processed_files = $2, progress_percent = $3 WHERE id = $1",
)
.bind(job_id)
.bind(processed)
.bind(percent)
.execute(&pool)
.await;
}
}
}
Err(e) => warn!("thumbnail generate failed for book {}: {:?}", book_id, e),
}
}
Err(e) => warn!("render page 1 failed for book {}: {:?}", book_id, e),
}
}
})
.await;
let _ = sqlx::query(
"UPDATE index_jobs SET status = 'success', finished_at = NOW(), progress_percent = 100, current_file = NULL WHERE id = $1",
)
.bind(job_id)
.execute(pool)
.await;
info!("thumbnails checkup finished for job {} ({} books)", job_id, total);
}
use crate::{error::ApiError, index_jobs, state::AppState};
#[derive(Deserialize, ToSchema)]
pub struct ThumbnailsRebuildRequest {
@@ -312,7 +14,7 @@ pub struct ThumbnailsRebuildRequest {
pub library_id: Option<Uuid>,
}
/// POST /index/thumbnails/rebuild — create a job and generate thumbnails for books that don't have one (optional library scope).
/// POST /index/thumbnails/rebuild — create a job to generate thumbnails for books that don't have one.
#[utoipa::path(
post,
path = "/index/thumbnails/rebuild",
@@ -346,7 +48,7 @@ pub async fn start_thumbnails_rebuild(
Ok(Json(index_jobs::map_row(row)))
}
/// POST /index/thumbnails/regenerate — create a job and regenerate all thumbnails in scope (clears then regenerates).
/// POST /index/thumbnails/regenerate — create a job to regenerate all thumbnails (clears then regenerates).
#[utoipa::path(
post,
path = "/index/thumbnails/regenerate",
@@ -379,13 +81,3 @@ pub async fn start_thumbnails_regenerate(
Ok(Json(index_jobs::map_row(row)))
}
/// POST /index/jobs/:id/thumbnails/checkup — start thumbnail generation for books missing thumbnails (called by indexer at end of build).
pub async fn start_checkup(
State(state): State<AppState>,
AxumPath(job_id): AxumPath<Uuid>,
) -> Result<StatusCode, ApiError> {
let state = state.clone();
tokio::spawn(async move { run_checkup(state, job_id).await });
Ok(StatusCode::ACCEPTED)
}