From 9c7120c3dc1cd3bace163de2a811787a8f6a21b4 Mon Sep 17 00:00:00 2001 From: Froidefond Julien Date: Sun, 8 Mar 2026 21:07:03 +0100 Subject: [PATCH] feat: enhance library scanning and metadata parsing - Introduce a structured approach to collect book file information before parsing. - Implement parallel processing for metadata extraction to improve performance. - Refactor file handling to utilize a new FileInfo struct for better organization. - Update database interactions to use collected file information for batch inserts. - Improve logging for scanning and parsing processes to provide better insights. --- apps/indexer/src/main.rs | 143 +++++++++++++++++++++++++-------------- 1 file changed, 91 insertions(+), 52 deletions(-) diff --git a/apps/indexer/src/main.rs b/apps/indexer/src/main.rs index c99edaa..1c207b3 100644 --- a/apps/indexer/src/main.rs +++ b/apps/indexer/src/main.rs @@ -3,7 +3,7 @@ use axum::{extract::State, routing::get, Json, Router}; use chrono::{DateTime, Utc}; use axum::http::StatusCode; use notify::{Event, RecommendedWatcher, RecursiveMode, Watcher}; -use parsers::{detect_format, parse_metadata, BookFormat}; +use parsers::{detect_format, parse_metadata, BookFormat, ParsedMetadata}; use rayon::prelude::*; use serde::Serialize; use sha2::{Digest, Sha256}; @@ -878,28 +878,78 @@ async fn scan_library( let mut files_to_insert: Vec = Vec::with_capacity(BATCH_SIZE); let mut errors_to_insert: Vec = Vec::with_capacity(BATCH_SIZE); + // Step 1: Collect all book files first + #[derive(Clone)] + struct FileInfo { + path: std::path::PathBuf, + format: BookFormat, + abs_path: String, + file_name: String, + metadata: std::fs::Metadata, + mtime: DateTime, + fingerprint: String, + lookup_path: String, + } + + let mut file_infos: Vec = Vec::new(); for entry in WalkDir::new(root).into_iter().filter_map(Result::ok) { if !entry.file_type().is_file() { continue; } - let path = entry.path(); - let Some(format) = detect_format(path) else { + let path = entry.path().to_path_buf(); + let Some(format) = detect_format(&path) else { trace!("[SCAN] Skipping non-book file: {}", path.display()); continue; }; info!("[SCAN] Found book file: {} (format: {:?})", path.display(), format); stats.scanned_files += 1; - library_processed_count += 1; - *total_processed_count += 1; + let abs_path_local = path.to_string_lossy().to_string(); let abs_path = unmap_libraries_path(&abs_path_local); let file_name = path.file_name() .map(|s| s.to_string_lossy().to_string()) .unwrap_or_else(|| abs_path.clone()); - let start_time = std::time::Instant::now(); + let metadata = std::fs::metadata(&path) + .with_context(|| format!("cannot stat {}", path.display()))?; + let mtime: DateTime = metadata + .modified() + .map(DateTime::::from) + .unwrap_or_else(|_| Utc::now()); + let fingerprint = compute_fingerprint(&path, metadata.len(), &mtime)?; + let lookup_path = remap_libraries_path(&abs_path); + + file_infos.push(FileInfo { + path, + format, + abs_path, + file_name, + metadata, + mtime, + fingerprint, + lookup_path, + }); + } + + info!("[SCAN] Collected {} files, starting parallel parsing", file_infos.len()); + + // Step 2: Parse metadata in parallel + let parsed_results: Vec<(FileInfo, anyhow::Result)> = file_infos + .into_par_iter() + .map(|file_info| { + let parse_result = parse_metadata(&file_info.path, file_info.format, root); + (file_info, parse_result) + }) + .collect(); + + info!("[SCAN] Completed parallel parsing, processing {} results", parsed_results.len()); + + // Step 3: Process results sequentially for batch inserts + for (file_info, parse_result) in parsed_results { + library_processed_count += 1; + *total_processed_count += 1; // Update progress in DB every 1 second or every 10 files let should_update_progress = last_progress_update.elapsed() > Duration::from_secs(1) || library_processed_count % 10 == 0; @@ -914,7 +964,7 @@ async fn scan_library( "UPDATE index_jobs SET current_file = $2, processed_files = $3, progress_percent = $4 WHERE id = $1" ) .bind(job_id) - .bind(&file_name) + .bind(&file_info.file_name) .bind(*total_processed_count) .bind(progress_percent) .execute(&state.pool) @@ -935,32 +985,23 @@ async fn scan_library( } } - let seen_key = remap_libraries_path(&abs_path); + let seen_key = remap_libraries_path(&file_info.abs_path); seen.insert(seen_key.clone(), true); - let metadata = std::fs::metadata(path) - .with_context(|| format!("cannot stat {}", path.display()))?; - let mtime: DateTime = metadata - .modified() - .map(DateTime::::from) - .unwrap_or_else(|_| Utc::now()); - let fingerprint = compute_fingerprint(path, metadata.len(), &mtime)?; - - let lookup_path = remap_libraries_path(&abs_path); - if let Some((file_id, book_id, old_fingerprint)) = existing.get(&lookup_path).cloned() { - if !is_full_rebuild && old_fingerprint == fingerprint { - trace!("[PROCESS] Skipping unchanged file: {}", file_name); + if let Some((file_id, book_id, old_fingerprint)) = existing.get(&file_info.lookup_path).cloned() { + if !is_full_rebuild && old_fingerprint == file_info.fingerprint { + trace!("[PROCESS] Skipping unchanged file: {}", file_info.file_name); continue; } - info!("[PROCESS] Updating existing file: {} (full_rebuild={}, fingerprint_match={})", file_name, is_full_rebuild, old_fingerprint == fingerprint); + info!("[PROCESS] Updating existing file: {} (full_rebuild={}, fingerprint_match={})", file_info.file_name, is_full_rebuild, old_fingerprint == file_info.fingerprint); - match parse_metadata(path, format, root) { + match parse_result { Ok(parsed) => { books_to_update.push(BookUpdate { book_id, title: parsed.title, - kind: kind_from_format(format).to_string(), + kind: kind_from_format(file_info.format).to_string(), series: parsed.series, volume: parsed.volume, page_count: parsed.page_count, @@ -968,29 +1009,29 @@ async fn scan_library( files_to_update.push(FileUpdate { file_id, - format: format.as_str().to_string(), - size_bytes: metadata.len() as i64, - mtime, - fingerprint, + format: file_info.format.as_str().to_string(), + size_bytes: file_info.metadata.len() as i64, + mtime: file_info.mtime, + fingerprint: file_info.fingerprint, }); stats.indexed_files += 1; } Err(err) => { - warn!("[PARSER] Failed to parse {}: {}", file_name, err); + warn!("[PARSER] Failed to parse {}: {}", file_info.file_name, err); stats.errors += 1; files_to_update.push(FileUpdate { file_id, - format: format.as_str().to_string(), - size_bytes: metadata.len() as i64, - mtime, - fingerprint: fingerprint.clone(), + format: file_info.format.as_str().to_string(), + size_bytes: file_info.metadata.len() as i64, + mtime: file_info.mtime, + fingerprint: file_info.fingerprint.clone(), }); errors_to_insert.push(ErrorInsert { job_id, - file_path: abs_path.clone(), + file_path: file_info.abs_path.clone(), error_message: err.to_string(), }); @@ -1014,17 +1055,17 @@ async fn scan_library( } // New file (thumbnails generated by API after job handoff) - info!("[PROCESS] Inserting new file: {}", file_name); + info!("[PROCESS] Inserting new file: {}", file_info.file_name); let book_id = Uuid::new_v4(); - match parse_metadata(path, format, root) { + match parse_result { Ok(parsed) => { let file_id = Uuid::new_v4(); books_to_insert.push(BookInsert { book_id, library_id, - kind: kind_from_format(format).to_string(), + kind: kind_from_format(file_info.format).to_string(), title: parsed.title, series: parsed.series, volume: parsed.volume, @@ -1035,11 +1076,11 @@ async fn scan_library( files_to_insert.push(FileInsert { file_id, book_id, - format: format.as_str().to_string(), - abs_path: abs_path.clone(), - size_bytes: metadata.len() as i64, - mtime, - fingerprint, + format: file_info.format.as_str().to_string(), + abs_path: file_info.abs_path.clone(), + size_bytes: file_info.metadata.len() as i64, + mtime: file_info.mtime, + fingerprint: file_info.fingerprint, parse_status: "ok".to_string(), parse_error: None, }); @@ -1047,7 +1088,7 @@ async fn scan_library( stats.indexed_files += 1; } Err(err) => { - warn!("[PARSER] Failed to parse {}: {}", file_name, err); + warn!("[PARSER] Failed to parse {}: {}", file_info.file_name, err); stats.errors += 1; let book_id = Uuid::new_v4(); let file_id = Uuid::new_v4(); @@ -1055,8 +1096,8 @@ async fn scan_library( books_to_insert.push(BookInsert { book_id, library_id, - kind: kind_from_format(format).to_string(), - title: file_display_name(path), + kind: kind_from_format(file_info.format).to_string(), + title: file_display_name(&file_info.path), series: None, volume: None, page_count: None, @@ -1066,18 +1107,18 @@ async fn scan_library( files_to_insert.push(FileInsert { file_id, book_id, - format: format.as_str().to_string(), - abs_path: abs_path.clone(), - size_bytes: metadata.len() as i64, - mtime, - fingerprint, + format: file_info.format.as_str().to_string(), + abs_path: file_info.abs_path.clone(), + size_bytes: file_info.metadata.len() as i64, + mtime: file_info.mtime, + fingerprint: file_info.fingerprint, parse_status: "error".to_string(), parse_error: Some(err.to_string()), }); errors_to_insert.push(ErrorInsert { job_id, - file_path: abs_path, + file_path: file_info.abs_path, error_message: err.to_string(), }); } @@ -1087,8 +1128,6 @@ async fn scan_library( if books_to_insert.len() >= BATCH_SIZE || files_to_insert.len() >= BATCH_SIZE { flush_all_batches(&state.pool, &mut books_to_update, &mut files_to_update, &mut books_to_insert, &mut files_to_insert, &mut errors_to_insert).await?; } - - trace!("[DONE] Processed file {} (total time: {:?})", file_name, start_time.elapsed()); } // Final flush of any remaining items