feat: two-phase indexation with direct thumbnail generation in indexer

Phase 1 (discovery): walkdir + filename-only metadata, zero archive I/O.
Books are visible immediately in the UI while Phase 2 runs in background.

Phase 2 (analysis): open each archive once via analyze_book() to extract
page_count and first page bytes, then generate WebP thumbnail directly in
the indexer — removing the HTTP roundtrip to the API checkup endpoint.

- Add parse_metadata_fast() (infallible, no archive I/O)
- Add analyze_book() returning (page_count, first_page_bytes) in one pass
- Add looks_like_image() magic bytes check for unrar p stdout validation
- Add lsar fallback in list_cbr_images() for UTF-16BE encoded filenames
- Add directory_mtimes table to skip unchanged dirs on incremental scans
- Add analyzer.rs: generate_thumbnail, analyze_library_books, regenerate_thumbnails
- Remove run_checkup() from API; indexer handles thumbnail jobs directly
- Remove api_base_url/api_bootstrap_token from IndexerConfig and AppState
- Add unar + poppler-utils to indexer Dockerfile
- Fix smoke.sh: wait for job completion, check thumbnail_url field

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-09 22:13:05 +01:00
parent 36af34443e
commit cfc896e92f
22 changed files with 1274 additions and 768 deletions

View File

@@ -32,10 +32,6 @@ pub struct IndexerConfig {
pub meili_master_key: String,
pub scan_interval_seconds: u64,
pub thumbnail_config: ThumbnailConfig,
/// API base URL for thumbnail checkup at end of build (e.g. http://api:7080)
pub api_base_url: String,
/// Token to call API (e.g. API_BOOTSTRAP_TOKEN)
pub api_bootstrap_token: String,
}
#[derive(Debug, Clone)]
@@ -97,10 +93,6 @@ impl IndexerConfig {
.and_then(|v| v.parse::<u64>().ok())
.unwrap_or(5),
thumbnail_config,
api_base_url: std::env::var("API_BASE_URL")
.unwrap_or_else(|_| "http://api:7080".to_string()),
api_bootstrap_token: std::env::var("API_BOOTSTRAP_TOKEN")
.context("API_BOOTSTRAP_TOKEN is required for thumbnail checkup")?,
})
}
}

View File

@@ -2,6 +2,7 @@ use anyhow::{Context, Result};
use std::io::Read;
use std::path::Path;
use std::process::Command;
use std::sync::OnceLock;
use uuid::Uuid;
use walkdir::WalkDir;
@@ -40,38 +41,52 @@ pub fn detect_format(path: &Path) -> Option<BookFormat> {
}
}
pub fn parse_metadata(
path: &Path,
format: BookFormat,
library_root: &Path,
) -> Result<ParsedMetadata> {
let filename = path
.file_stem()
.map(|s| s.to_string_lossy().to_string())
.unwrap_or_else(|| "Untitled".to_string());
// Cache compiled regex patterns — compiled once on first use
static VOLUME_PATTERNS: OnceLock<Vec<(regex::Regex, usize)>> = OnceLock::new();
// Extract volume from filename (patterns: T01, T02, Vol 1, Volume 1, #1, - 01, etc.)
let volume = extract_volume(&filename);
/// Returns the table of volume-detection regexes, building it on first use.
///
/// Each entry pairs a compiled pattern with the index of the capture group
/// that holds the volume digits. Entries keep the order of the source list
/// below. A pattern that fails to compile is left out of the table rather
/// than causing a panic.
fn get_volume_patterns() -> &'static Vec<(regex::Regex, usize)> {
    VOLUME_PATTERNS.get_or_init(|| {
        // (pattern source, capture group holding the digits)
        let sources: [(&str, usize); 5] = [
            // T01, T02 (most common for manga/comics)
            (r"(?i)T(\d+)", 1),
            // Vol 1, Vol. 1
            (r"(?i)Vol\.?\s*(\d+)", 1),
            // Volume 1
            (r"(?i)Volume\s*(\d+)", 1),
            // #1, #01
            (r"#(\d+)", 1),
            // "- 1", "- 01" at the very end of the name
            (r"-\s*(\d+)\s*$", 1),
        ];

        let mut compiled = Vec::new();
        for (source, group) in sources.iter() {
            if let Ok(re) = regex::Regex::new(source) {
                compiled.push((re, *group));
            }
        }
        compiled
    })
}
// Keep original filename as title (don't clean it)
let title = filename;
/// Extracts a volume number from a file name (without extension).
///
/// The patterns from `get_volume_patterns` are tried in table order; the
/// digits captured by the first pattern that matches are parsed as `i32`
/// (which also drops leading zeros). If those digits do not fit in an `i32`
/// the result is `None` — later patterns are not tried in that case.
/// Returns `None` when no pattern matches.
fn extract_volume(filename: &str) -> Option<i32> {
    let digits = get_volume_patterns().iter().find_map(|(re, group)| {
        re.captures(filename)
            .and_then(|caps| caps.get(*group))
    })?;
    digits.as_str().parse::<i32>().ok()
}
// Determine series from parent folder relative to library root
let series = path.parent().and_then(|parent| {
// Normalize paths for comparison (handle different separators, etc.)
fn extract_series(path: &Path, library_root: &Path) -> Option<String> {
path.parent().and_then(|parent| {
let parent_str = parent.to_string_lossy().to_string();
let root_str = library_root.to_string_lossy().to_string();
// Try to find the library root in the parent path
let relative = if let Some(idx) = parent_str.find(&root_str) {
// Found root in parent, extract what comes after
let after_root = &parent_str[idx + root_str.len()..];
Path::new(after_root)
} else if let Some(relative) = parent.strip_prefix(library_root).ok() {
// Standard approach works
} else if let Ok(relative) = parent.strip_prefix(library_root) {
relative
} else {
// Log for diagnostic on server
eprintln!(
"[PARSER] Cannot determine series: parent '{}' doesn't start with root '{}'",
parent.display(),
@@ -80,16 +95,14 @@ pub fn parse_metadata(
return None;
};
// Remove leading separators
let relative_str = relative.to_string_lossy().to_string();
let relative_clean = relative_str.trim_start_matches(|c| c == '/' || c == '\\');
let relative_clean = relative_str.trim_start_matches(['/', '\\']);
if relative_clean.is_empty() {
return None;
}
// Get first component as series
let first_sep = relative_clean.find(|c| c == '/' || c == '\\');
let first_sep = relative_clean.find(['/', '\\']);
let series_name = match first_sep {
Some(idx) => &relative_clean[..idx],
None => relative_clean,
@@ -100,80 +113,178 @@ pub fn parse_metadata(
} else {
Some(series_name.to_string())
}
});
})
}
let page_count = match format {
/// Builds metadata from the path alone, without opening the book file.
///
/// * `title` is the file stem, unmodified (`"Untitled"` if the path has none).
/// * `volume` comes from `extract_volume` applied to that stem.
/// * `series` comes from `extract_series(path, library_root)`.
/// * `page_count` is always `None` here.
///
/// `_format` is accepted but not used. This function cannot fail.
pub fn parse_metadata_fast(path: &Path, _format: BookFormat, library_root: &Path) -> ParsedMetadata {
    let title = match path.file_stem() {
        Some(stem) => stem.to_string_lossy().to_string(),
        None => "Untitled".to_string(),
    };

    ParsedMetadata {
        // Fields are evaluated top to bottom: volume first, then series,
        // and only then is `title` moved into the struct.
        volume: extract_volume(&title),
        series: extract_series(path, library_root),
        title,
        page_count: None,
    }
}
pub fn parse_metadata(
path: &Path,
format: BookFormat,
library_root: &Path,
) -> Result<ParsedMetadata> {
let mut meta = parse_metadata_fast(path, format, library_root);
meta.page_count = match format {
BookFormat::Cbz => parse_cbz_page_count(path).ok(),
BookFormat::Cbr => parse_cbr_page_count(path).ok(),
BookFormat::Pdf => parse_pdf_page_count(path).ok(),
};
Ok(ParsedMetadata {
title,
series,
volume,
page_count,
})
Ok(meta)
}
fn extract_volume(filename: &str) -> Option<i32> {
// Common volume patterns: T01, T02, T1, T2, Vol 1, Vol. 1, Volume 1, #1, #01, - 1, - 01
let patterns = [
// T01, T02 pattern (most common for manga/comics)
(r"(?i)T(\d+)", 1),
// Vol 1, Vol. 1, Volume 1
(r"(?i)Vol\.?\s*(\d+)", 1),
(r"(?i)Volume\s*(\d+)", 1),
// #1, #01
(r"#(\d+)", 1),
// - 1, - 01 at the end
(r"-\s*(\d+)\s*$", 1),
];
/// Open an archive once and return (page_count, first_page_bytes).
/// This is more efficient than calling parse_metadata + extract_first_page separately.
pub fn analyze_book(path: &Path, format: BookFormat) -> Result<(i32, Vec<u8>)> {
match format {
BookFormat::Cbz => analyze_cbz(path),
BookFormat::Cbr => analyze_cbr(path),
BookFormat::Pdf => analyze_pdf(path),
}
}
for (pattern, group) in &patterns {
if let Ok(re) = regex::Regex::new(pattern) {
if let Some(caps) = re.captures(filename) {
if let Some(mat) = caps.get(*group) {
// Parse as integer to remove leading zeros
return mat.as_str().parse::<i32>().ok();
}
}
/// Reads a CBZ archive and returns `(page_count, first_page_bytes)`.
///
/// Pages are the archive entries whose lowercased name is accepted by
/// `is_image_name`. They are ordered by a plain lexicographic sort of their
/// entry names, and the first one after sorting is read in full.
///
/// Entry names are taken from the archive index via `file_names()` instead of
/// opening every entry with `by_index`. Only the first page is actually
/// opened, so a side entry that cannot be opened (for example an encrypted or
/// unsupported-compression file) does not make the whole book fail, and no
/// per-entry seek is needed just to count pages.
///
/// # Errors
/// Fails when the file cannot be opened, is not a valid zip archive, contains
/// no image entries, or the first image cannot be read.
fn analyze_cbz(path: &Path) -> Result<(i32, Vec<u8>)> {
    let file = std::fs::File::open(path)
        .with_context(|| format!("cannot open cbz: {}", path.display()))?;
    let mut archive = zip::ZipArchive::new(file).context("invalid cbz archive")?;

    // Keep the original (non-lowercased) names: they are needed for `by_name`.
    let mut image_names: Vec<String> = archive
        .file_names()
        .filter(|name| is_image_name(&name.to_ascii_lowercase()))
        .map(str::to_owned)
        .collect();
    image_names.sort_unstable();

    // Checked conversion instead of a silently truncating `as i32`.
    let count = i32::try_from(image_names.len()).context("too many images in cbz")?;
    let first_image = image_names.first().context("no images found in cbz")?;

    let mut entry = archive
        .by_name(first_image)
        .context("cannot read first image")?;
    let mut buf = Vec::new();
    entry
        .read_to_end(&mut buf)
        .context("cannot read first image data")?;
    Ok((count, buf))
}
/// Lists the image entries of a CBR (RAR) archive, in the order the listing
/// tool prints them (callers sort if they need an order).
///
/// Strategy:
/// 1. `unrar lb <path>` — fast bare listing. Used when the command succeeds
///    and yields at least one name accepted by `is_image_name`.
/// 2. Otherwise `lsar <path>` (from the unar package), which handles
///    UTF-16BE encoded file names.
///
/// A failure to *start* `unrar` (e.g. the binary is not installed) is treated
/// like any other `unrar` failure and falls through to `lsar`, instead of
/// aborting before the fallback gets a chance to run.
///
/// # Errors
/// Fails when `lsar` cannot be started or exits unsuccessfully after `unrar`
/// did not produce a usable listing. An archive with no images yields
/// `Ok` with an empty vector.
fn list_cbr_images(path: &Path) -> Result<Vec<String>> {
    // Try unrar lb first (fast)
    match std::process::Command::new("unrar").arg("lb").arg(path).output() {
        Ok(output) if output.status.success() => {
            let stdout = String::from_utf8_lossy(&output.stdout);
            let images: Vec<String> = stdout
                .lines()
                .filter(|line| is_image_name(&line.to_ascii_lowercase()))
                .map(str::to_owned)
                .collect();
            if !images.is_empty() {
                return Ok(images);
            }
        }
        // Non-zero exit or spawn failure: not fatal, lsar is tried next.
        Ok(_) | Err(_) => {}
    }

    // Fallback: lsar (from unar package) handles UTF-16BE encoded filenames
    let lsar_output = std::process::Command::new("lsar")
        .arg(path)
        .output()
        .with_context(|| format!("failed to execute lsar for {}", path.display()))?;

    if !lsar_output.status.success() {
        return Err(anyhow::anyhow!(
            "both unrar lb and lsar failed for {}",
            path.display()
        ));
    }

    let stdout = String::from_utf8_lossy(&lsar_output.stdout);
    // lsar output: first line is archive info, then one file per line
    let images: Vec<String> = stdout
        .lines()
        .skip(1) // skip the archive header line
        .map(|line| line.trim().to_string())
        .filter(|line| is_image_name(&line.to_ascii_lowercase()))
        .collect();

    Ok(images)
}
#[allow(dead_code)]
fn clean_title(filename: &str) -> String {
// Remove volume patterns from title to clean it up
let cleaned = regex::Regex::new(r"(?i)\s*T\d+\s*")
.ok()
.and_then(|re| Some(re.replace_all(filename, " ").to_string()))
.unwrap_or_else(|| filename.to_string());
fn analyze_cbr(path: &Path) -> Result<(i32, Vec<u8>)> {
let mut image_names = list_cbr_images(path)?;
image_names.sort();
let cleaned = regex::Regex::new(r"(?i)\s*Vol\.?\s*\d+\s*")
.ok()
.and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
.unwrap_or_else(|| cleaned);
let count = image_names.len() as i32;
if count == 0 {
return Err(anyhow::anyhow!("no images found in cbr: {}", path.display()));
}
let cleaned = regex::Regex::new(r"(?i)\s*Volume\s*\d+\s*")
.ok()
.and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
.unwrap_or_else(|| cleaned);
let first_name = &image_names[0];
let cleaned = regex::Regex::new(r"#\d+")
.ok()
.and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
.unwrap_or_else(|| cleaned);
// Try unrar p to extract first image to stdout (faster — no temp dir)
let p_output = std::process::Command::new("unrar")
.args(["p", "-inul"])
.arg(path)
.arg(first_name)
.output();
let cleaned = regex::Regex::new(r"-\s*\d+\s*$")
.ok()
.and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
.unwrap_or_else(|| cleaned);
match p_output {
Ok(out) if out.status.success() && looks_like_image(&out.stdout) => Ok((count, out.stdout)),
_ => {
// Fallback: full extraction with unar (handles special chars, encoding issues)
let image_bytes = extract_cbr_first_page(path)?;
Ok((count, image_bytes))
}
}
}
// Clean up extra spaces
cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
/// Cheap sanity check that a byte buffer starts like a supported image.
///
/// Recognized signatures:
/// * JPEG — `FF D8 FF`
/// * PNG — the first four signature bytes `89 50 4E 47`
/// * WebP — `RIFF` at offset 0 and `WEBP` at offset 8
///
/// Any buffer shorter than 12 bytes is rejected, whatever it starts with.
fn looks_like_image(bytes: &[u8]) -> bool {
    // Exactly the first 12 bytes; `None` means the buffer is too short.
    let header = match bytes.get(..12) {
        Some(header) => header,
        None => return false,
    };

    let is_jpeg = header.starts_with(&[0xFF, 0xD8, 0xFF]);
    let is_png = header.starts_with(&[0x89, 0x50, 0x4E, 0x47]);
    // `header` is 12 bytes long, so its last four bytes are offsets 8..12.
    let is_webp = header.starts_with(b"RIFF") && header.ends_with(b"WEBP");

    is_jpeg || is_png || is_webp
}
/// Returns `(page_count, first_page_bytes)` for a PDF.
///
/// The page count is obtained first via `parse_pdf_page_count`; the first
/// page is only extracted (`extract_pdf_first_page`) if that succeeded.
/// The first error encountered is returned unchanged.
fn analyze_pdf(path: &Path) -> Result<(i32, Vec<u8>)> {
    parse_pdf_page_count(path).and_then(|page_count| {
        extract_pdf_first_page(path).map(|first_page| (page_count, first_page))
    })
}
fn parse_cbz_page_count(path: &Path) -> Result<i32> {
@@ -192,26 +303,11 @@ fn parse_cbz_page_count(path: &Path) -> Result<i32> {
}
fn parse_cbr_page_count(path: &Path) -> Result<i32> {
let output = std::process::Command::new("unrar")
.arg("lb")
.arg(path)
.output()
.with_context(|| format!("failed to execute unrar for {}", path.display()))?;
if !output.status.success() {
return Err(anyhow::anyhow!("unrar failed for {}", path.display()));
}
let stdout = String::from_utf8_lossy(&output.stdout);
let count = stdout
.lines()
.filter(|line| is_image_name(&line.to_ascii_lowercase()))
.count() as i32;
Ok(count)
let images = list_cbr_images(path)?;
Ok(images.len() as i32)
}
fn parse_pdf_page_count(path: &Path) -> Result<i32> {
// Use pdfinfo command line tool instead of lopdf for better performance
let output = std::process::Command::new("pdfinfo")
.arg(path)
.output()
@@ -238,6 +334,10 @@ fn parse_pdf_page_count(path: &Path) -> Result<i32> {
}
fn is_image_name(name: &str) -> bool {
// Skip macOS metadata entries (__MACOSX/ prefix or AppleDouble ._* files)
if name.starts_with("__macosx/") || name.contains("/._") || name.starts_with("._") {
return false;
}
name.ends_with(".jpg")
|| name.ends_with(".jpeg")
|| name.ends_with(".png")
@@ -282,7 +382,6 @@ fn extract_cbr_first_page(path: &Path) -> Result<Vec<u8>> {
let tmp_dir = std::env::temp_dir().join(format!("stripstream-cbr-thumb-{}", Uuid::new_v4()));
std::fs::create_dir_all(&tmp_dir).context("cannot create temp dir")?;
// Use env command like the API does
let output = std::process::Command::new("env")
.args(["LC_ALL=en_US.UTF-8", "LANG=en_US.UTF-8", "unar", "-o"])
.arg(&tmp_dir)
@@ -298,7 +397,6 @@ fn extract_cbr_first_page(path: &Path) -> Result<Vec<u8>> {
));
}
// Use WalkDir for recursive search (CBR can have subdirectories)
let mut image_files: Vec<_> = WalkDir::new(&tmp_dir)
.into_iter()
.filter_map(|e| e.ok())
@@ -346,3 +444,33 @@ fn extract_pdf_first_page(path: &Path) -> Result<Vec<u8>> {
let _ = std::fs::remove_dir_all(&tmp_dir);
Ok(data)
}
#[allow(dead_code)]
fn clean_title(filename: &str) -> String {
let cleaned = regex::Regex::new(r"(?i)\s*T\d+\s*")
.ok()
.map(|re| re.replace_all(filename, " ").to_string())
.unwrap_or_else(|| filename.to_string());
let cleaned = regex::Regex::new(r"(?i)\s*Vol\.?\s*\d+\s*")
.ok()
.map(|re| re.replace_all(&cleaned, " ").to_string())
.unwrap_or(cleaned);
let cleaned = regex::Regex::new(r"(?i)\s*Volume\s*\d+\s*")
.ok()
.map(|re| re.replace_all(&cleaned, " ").to_string())
.unwrap_or(cleaned);
let cleaned = regex::Regex::new(r"#\d+")
.ok()
.map(|re| re.replace_all(&cleaned, " ").to_string())
.unwrap_or(cleaned);
let cleaned = regex::Regex::new(r"-\s*\d+\s*$")
.ok()
.map(|re| re.replace_all(&cleaned, " ").to_string())
.unwrap_or(cleaned);
cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
}