Files
stripstream-librarian/apps/indexer/src/scanner.rs
Froidefond Julien 6d4c400017 refactor: update AppState references to use state module
- Change all instances of AppState to reference the new state module across multiple files for consistency.
- Clean up imports in auth, books, index_jobs, libraries, pages, search, settings, thumbnails, and tokens modules.
- Simplify main.rs by removing unused code and organizing middleware and route handlers under the new handlers module.
2026-03-08 21:19:22 +01:00

361 lines
14 KiB
Rust

use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use parsers::{detect_format, parse_metadata, BookFormat, ParsedMetadata};
use rayon::prelude::*;
use serde::Serialize;
use sqlx::Row;
use std::{collections::HashMap, path::Path, time::Duration};
use tracing::{error, info, trace, warn};
use uuid::Uuid;
use walkdir::WalkDir;
use crate::{
batch::{flush_all_batches, BookInsert, BookUpdate, ErrorInsert, FileInsert, FileUpdate},
job::is_job_cancelled,
utils,
AppState,
};
#[derive(Serialize)]
pub struct JobStats {
pub scanned_files: usize,
pub indexed_files: usize,
pub removed_files: usize,
pub errors: usize,
}
const BATCH_SIZE: usize = 100;
pub async fn scan_library(
state: &AppState,
job_id: Uuid,
library_id: Uuid,
root: &Path,
stats: &mut JobStats,
total_processed_count: &mut i32,
total_files: usize,
is_full_rebuild: bool,
) -> Result<()> {
info!("[SCAN] Starting scan of library {} at path: {} (full_rebuild={})", library_id, root.display(), is_full_rebuild);
let existing_rows = sqlx::query(
r#"
SELECT bf.id AS file_id, bf.book_id, bf.abs_path, bf.fingerprint
FROM book_files bf
JOIN books b ON b.id = bf.book_id
WHERE b.library_id = $1
"#,
)
.bind(library_id)
.fetch_all(&state.pool)
.await?;
let mut existing: HashMap<String, (Uuid, Uuid, String)> = HashMap::new();
if !is_full_rebuild {
for row in existing_rows {
let abs_path: String = row.get("abs_path");
let remapped_path = utils::remap_libraries_path(&abs_path);
existing.insert(
remapped_path,
(row.get("file_id"), row.get("book_id"), row.get("fingerprint")),
);
}
info!("[SCAN] Found {} existing files in database for library {}", existing.len(), library_id);
} else {
info!("[SCAN] Full rebuild: skipping existing files lookup (all will be treated as new)");
}
let mut seen: HashMap<String, bool> = HashMap::new();
let mut library_processed_count = 0i32;
let mut last_progress_update = std::time::Instant::now();
// Batching buffers
let mut books_to_update: Vec<BookUpdate> = Vec::with_capacity(BATCH_SIZE);
let mut files_to_update: Vec<FileUpdate> = Vec::with_capacity(BATCH_SIZE);
let mut books_to_insert: Vec<BookInsert> = Vec::with_capacity(BATCH_SIZE);
let mut files_to_insert: Vec<FileInsert> = Vec::with_capacity(BATCH_SIZE);
let mut errors_to_insert: Vec<ErrorInsert> = Vec::with_capacity(BATCH_SIZE);
// Step 1: Collect all book files first
#[derive(Clone)]
struct FileInfo {
path: std::path::PathBuf,
format: BookFormat,
abs_path: String,
file_name: String,
metadata: std::fs::Metadata,
mtime: DateTime<Utc>,
fingerprint: String,
lookup_path: String,
}
let mut file_infos: Vec<FileInfo> = Vec::new();
for entry in WalkDir::new(root).into_iter().filter_map(Result::ok) {
if !entry.file_type().is_file() {
continue;
}
let path = entry.path().to_path_buf();
let Some(format) = detect_format(&path) else {
trace!("[SCAN] Skipping non-book file: {}", path.display());
continue;
};
info!("[SCAN] Found book file: {} (format: {:?})", path.display(), format);
stats.scanned_files += 1;
let abs_path_local = path.to_string_lossy().to_string();
let abs_path = utils::unmap_libraries_path(&abs_path_local);
let file_name = path.file_name()
.map(|s| s.to_string_lossy().to_string())
.unwrap_or_else(|| abs_path.clone());
let metadata = std::fs::metadata(&path)
.with_context(|| format!("cannot stat {}", path.display()))?;
let mtime: DateTime<Utc> = metadata
.modified()
.map(DateTime::<Utc>::from)
.unwrap_or_else(|_| Utc::now());
let fingerprint = utils::compute_fingerprint(&path, metadata.len(), &mtime)?;
let lookup_path = utils::remap_libraries_path(&abs_path);
file_infos.push(FileInfo {
path,
format,
abs_path,
file_name,
metadata,
mtime,
fingerprint,
lookup_path,
});
}
info!("[SCAN] Collected {} files, starting parallel parsing", file_infos.len());
// Step 2: Parse metadata in parallel
let parsed_results: Vec<(FileInfo, Result<ParsedMetadata>)> = file_infos
.into_par_iter()
.map(|file_info| {
let parse_result = parse_metadata(&file_info.path, file_info.format, root);
(file_info, parse_result)
})
.collect();
info!("[SCAN] Completed parallel parsing, processing {} results", parsed_results.len());
// Step 3: Process results sequentially for batch inserts
for (file_info, parse_result) in parsed_results {
library_processed_count += 1;
*total_processed_count += 1;
// Update progress in DB every 1 second or every 10 files
let should_update_progress = last_progress_update.elapsed() > Duration::from_secs(1) || library_processed_count % 10 == 0;
if should_update_progress {
let progress_percent = if total_files > 0 {
((*total_processed_count as f64 / total_files as f64) * 100.0) as i32
} else {
0
};
sqlx::query(
"UPDATE index_jobs SET current_file = $2, processed_files = $3, progress_percent = $4 WHERE id = $1"
)
.bind(job_id)
.bind(&file_info.file_name)
.bind(*total_processed_count)
.bind(progress_percent)
.execute(&state.pool)
.await
.map_err(|e| {
error!("[BDD] Failed to update progress for job {}: {}", job_id, e);
e
})?;
last_progress_update = std::time::Instant::now();
// Check if job has been cancelled
if is_job_cancelled(&state.pool, job_id).await? {
info!("[JOB] Job {} cancelled by user, stopping...", job_id);
// Flush any pending batches before exiting
flush_all_batches(&state.pool, &mut books_to_update, &mut files_to_update, &mut books_to_insert, &mut files_to_insert, &mut errors_to_insert).await?;
return Err(anyhow::anyhow!("Job cancelled by user"));
}
}
let seen_key = utils::remap_libraries_path(&file_info.abs_path);
seen.insert(seen_key.clone(), true);
if let Some((file_id, book_id, old_fingerprint)) = existing.get(&file_info.lookup_path).cloned() {
if !is_full_rebuild && old_fingerprint == file_info.fingerprint {
trace!("[PROCESS] Skipping unchanged file: {}", file_info.file_name);
continue;
}
info!("[PROCESS] Updating existing file: {} (full_rebuild={}, fingerprint_match={})", file_info.file_name, is_full_rebuild, old_fingerprint == file_info.fingerprint);
match parse_result {
Ok(parsed) => {
books_to_update.push(BookUpdate {
book_id,
title: parsed.title,
kind: utils::kind_from_format(file_info.format).to_string(),
series: parsed.series,
volume: parsed.volume,
page_count: parsed.page_count,
});
files_to_update.push(FileUpdate {
file_id,
format: file_info.format.as_str().to_string(),
size_bytes: file_info.metadata.len() as i64,
mtime: file_info.mtime,
fingerprint: file_info.fingerprint,
});
stats.indexed_files += 1;
}
Err(err) => {
warn!("[PARSER] Failed to parse {}: {}", file_info.file_name, err);
stats.errors += 1;
files_to_update.push(FileUpdate {
file_id,
format: file_info.format.as_str().to_string(),
size_bytes: file_info.metadata.len() as i64,
mtime: file_info.mtime,
fingerprint: file_info.fingerprint.clone(),
});
errors_to_insert.push(ErrorInsert {
job_id,
file_path: file_info.abs_path.clone(),
error_message: err.to_string(),
});
// Also need to mark file as error - we'll do this separately
sqlx::query(
"UPDATE book_files SET parse_status = 'error', parse_error_opt = $2 WHERE id = $1"
)
.bind(file_id)
.bind(err.to_string())
.execute(&state.pool)
.await?;
}
}
// Flush if batch is full
if books_to_update.len() >= BATCH_SIZE || files_to_update.len() >= BATCH_SIZE {
flush_all_batches(&state.pool, &mut books_to_update, &mut files_to_update, &mut books_to_insert, &mut files_to_insert, &mut errors_to_insert).await?;
}
continue;
}
// New file (thumbnails generated by API after job handoff)
info!("[PROCESS] Inserting new file: {}", file_info.file_name);
let book_id = Uuid::new_v4();
match parse_result {
Ok(parsed) => {
let file_id = Uuid::new_v4();
books_to_insert.push(BookInsert {
book_id,
library_id,
kind: utils::kind_from_format(file_info.format).to_string(),
title: parsed.title,
series: parsed.series,
volume: parsed.volume,
page_count: parsed.page_count,
thumbnail_path: None,
});
files_to_insert.push(FileInsert {
file_id,
book_id,
format: file_info.format.as_str().to_string(),
abs_path: file_info.abs_path.clone(),
size_bytes: file_info.metadata.len() as i64,
mtime: file_info.mtime,
fingerprint: file_info.fingerprint,
parse_status: "ok".to_string(),
parse_error: None,
});
stats.indexed_files += 1;
}
Err(err) => {
warn!("[PARSER] Failed to parse {}: {}", file_info.file_name, err);
stats.errors += 1;
let book_id = Uuid::new_v4();
let file_id = Uuid::new_v4();
books_to_insert.push(BookInsert {
book_id,
library_id,
kind: utils::kind_from_format(file_info.format).to_string(),
title: utils::file_display_name(&file_info.path),
series: None,
volume: None,
page_count: None,
thumbnail_path: None,
});
files_to_insert.push(FileInsert {
file_id,
book_id,
format: file_info.format.as_str().to_string(),
abs_path: file_info.abs_path.clone(),
size_bytes: file_info.metadata.len() as i64,
mtime: file_info.mtime,
fingerprint: file_info.fingerprint,
parse_status: "error".to_string(),
parse_error: Some(err.to_string()),
});
errors_to_insert.push(ErrorInsert {
job_id,
file_path: file_info.abs_path,
error_message: err.to_string(),
});
}
}
// Flush if batch is full
if books_to_insert.len() >= BATCH_SIZE || files_to_insert.len() >= BATCH_SIZE {
flush_all_batches(&state.pool, &mut books_to_update, &mut files_to_update, &mut books_to_insert, &mut files_to_insert, &mut errors_to_insert).await?;
}
}
// Final flush of any remaining items
flush_all_batches(&state.pool, &mut books_to_update, &mut files_to_update, &mut books_to_insert, &mut files_to_insert, &mut errors_to_insert).await?;
info!("[SCAN] Library {} scan complete: {} files scanned, {} indexed, {} errors",
library_id, library_processed_count, stats.indexed_files, stats.errors);
// Handle deletions
let mut removed_count = 0usize;
for (abs_path, (file_id, book_id, _)) in existing {
if seen.contains_key(&abs_path) {
continue;
}
sqlx::query("DELETE FROM book_files WHERE id = $1")
.bind(file_id)
.execute(&state.pool)
.await?;
sqlx::query("DELETE FROM books WHERE id = $1 AND NOT EXISTS (SELECT 1 FROM book_files WHERE book_id = $1)")
.bind(book_id)
.execute(&state.pool)
.await?;
stats.removed_files += 1;
removed_count += 1;
}
if removed_count > 0 {
info!("[SCAN] Removed {} stale files from database", removed_count);
}
Ok(())
}