feat: two-phase indexing with direct thumbnail generation in indexer
Phase 1 (discovery): walkdir + filename-only metadata, zero archive I/O. Books are visible immediately in the UI while Phase 2 runs in background. Phase 2 (analysis): open each archive once via analyze_book() to extract page_count and first page bytes, then generate WebP thumbnail directly in the indexer — removing the HTTP roundtrip to the API checkup endpoint. - Add parse_metadata_fast() (infallible, no archive I/O) - Add analyze_book() returning (page_count, first_page_bytes) in one pass - Add looks_like_image() magic bytes check for unrar p stdout validation - Add lsar fallback in list_cbr_images() for UTF-16BE encoded filenames - Add directory_mtimes table to skip unchanged dirs on incremental scans - Add analyzer.rs: generate_thumbnail, analyze_library_books, regenerate_thumbnails - Remove run_checkup() from API; indexer handles thumbnail jobs directly - Remove api_base_url/api_bootstrap_token from IndexerConfig and AppState - Add unar + poppler-utils to indexer Dockerfile - Fix smoke.sh: wait for job completion, check thumbnail_url field Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -32,10 +32,6 @@ pub struct IndexerConfig {
|
||||
pub meili_master_key: String,
|
||||
pub scan_interval_seconds: u64,
|
||||
pub thumbnail_config: ThumbnailConfig,
|
||||
/// API base URL for thumbnail checkup at end of build (e.g. http://api:7080)
|
||||
pub api_base_url: String,
|
||||
/// Token to call API (e.g. API_BOOTSTRAP_TOKEN)
|
||||
pub api_bootstrap_token: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -97,10 +93,6 @@ impl IndexerConfig {
|
||||
.and_then(|v| v.parse::<u64>().ok())
|
||||
.unwrap_or(5),
|
||||
thumbnail_config,
|
||||
api_base_url: std::env::var("API_BASE_URL")
|
||||
.unwrap_or_else(|_| "http://api:7080".to_string()),
|
||||
api_bootstrap_token: std::env::var("API_BOOTSTRAP_TOKEN")
|
||||
.context("API_BOOTSTRAP_TOKEN is required for thumbnail checkup")?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ use anyhow::{Context, Result};
|
||||
use std::io::Read;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
use std::sync::OnceLock;
|
||||
use uuid::Uuid;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
@@ -40,38 +41,52 @@ pub fn detect_format(path: &Path) -> Option<BookFormat> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_metadata(
|
||||
path: &Path,
|
||||
format: BookFormat,
|
||||
library_root: &Path,
|
||||
) -> Result<ParsedMetadata> {
|
||||
let filename = path
|
||||
.file_stem()
|
||||
.map(|s| s.to_string_lossy().to_string())
|
||||
.unwrap_or_else(|| "Untitled".to_string());
|
||||
// Cache compiled regex patterns — compiled once on first use
|
||||
static VOLUME_PATTERNS: OnceLock<Vec<(regex::Regex, usize)>> = OnceLock::new();
|
||||
|
||||
// Extract volume from filename (patterns: T01, T02, Vol 1, Volume 1, #1, - 01, etc.)
|
||||
let volume = extract_volume(&filename);
|
||||
fn get_volume_patterns() -> &'static Vec<(regex::Regex, usize)> {
|
||||
VOLUME_PATTERNS.get_or_init(|| {
|
||||
[
|
||||
// T01, T02 pattern (most common for manga/comics)
|
||||
(r"(?i)T(\d+)", 1usize),
|
||||
// Vol 1, Vol. 1, Volume 1
|
||||
(r"(?i)Vol\.?\s*(\d+)", 1),
|
||||
(r"(?i)Volume\s*(\d+)", 1),
|
||||
// #1, #01
|
||||
(r"#(\d+)", 1),
|
||||
// - 1, - 01 at the end
|
||||
(r"-\s*(\d+)\s*$", 1),
|
||||
]
|
||||
.iter()
|
||||
.filter_map(|(pattern, group)| {
|
||||
regex::Regex::new(pattern).ok().map(|re| (re, *group))
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
}
|
||||
|
||||
// Keep original filename as title (don't clean it)
|
||||
let title = filename;
|
||||
fn extract_volume(filename: &str) -> Option<i32> {
|
||||
for (re, group) in get_volume_patterns() {
|
||||
if let Some(caps) = re.captures(filename) {
|
||||
if let Some(mat) = caps.get(*group) {
|
||||
return mat.as_str().parse::<i32>().ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
// Determine series from parent folder relative to library root
|
||||
let series = path.parent().and_then(|parent| {
|
||||
// Normalize paths for comparison (handle different separators, etc.)
|
||||
fn extract_series(path: &Path, library_root: &Path) -> Option<String> {
|
||||
path.parent().and_then(|parent| {
|
||||
let parent_str = parent.to_string_lossy().to_string();
|
||||
let root_str = library_root.to_string_lossy().to_string();
|
||||
|
||||
// Try to find the library root in the parent path
|
||||
let relative = if let Some(idx) = parent_str.find(&root_str) {
|
||||
// Found root in parent, extract what comes after
|
||||
let after_root = &parent_str[idx + root_str.len()..];
|
||||
Path::new(after_root)
|
||||
} else if let Some(relative) = parent.strip_prefix(library_root).ok() {
|
||||
// Standard approach works
|
||||
} else if let Ok(relative) = parent.strip_prefix(library_root) {
|
||||
relative
|
||||
} else {
|
||||
// Log for diagnostic on server
|
||||
eprintln!(
|
||||
"[PARSER] Cannot determine series: parent '{}' doesn't start with root '{}'",
|
||||
parent.display(),
|
||||
@@ -80,16 +95,14 @@ pub fn parse_metadata(
|
||||
return None;
|
||||
};
|
||||
|
||||
// Remove leading separators
|
||||
let relative_str = relative.to_string_lossy().to_string();
|
||||
let relative_clean = relative_str.trim_start_matches(|c| c == '/' || c == '\\');
|
||||
let relative_clean = relative_str.trim_start_matches(['/', '\\']);
|
||||
|
||||
if relative_clean.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Get first component as series
|
||||
let first_sep = relative_clean.find(|c| c == '/' || c == '\\');
|
||||
let first_sep = relative_clean.find(['/', '\\']);
|
||||
let series_name = match first_sep {
|
||||
Some(idx) => &relative_clean[..idx],
|
||||
None => relative_clean,
|
||||
@@ -100,80 +113,178 @@ pub fn parse_metadata(
|
||||
} else {
|
||||
Some(series_name.to_string())
|
||||
}
|
||||
});
|
||||
})
|
||||
}
|
||||
|
||||
let page_count = match format {
|
||||
/// Fast metadata extraction from filename only — no archive I/O. Always succeeds.
|
||||
pub fn parse_metadata_fast(path: &Path, _format: BookFormat, library_root: &Path) -> ParsedMetadata {
|
||||
let filename = path
|
||||
.file_stem()
|
||||
.map(|s| s.to_string_lossy().to_string())
|
||||
.unwrap_or_else(|| "Untitled".to_string());
|
||||
|
||||
let volume = extract_volume(&filename);
|
||||
let title = filename;
|
||||
let series = extract_series(path, library_root);
|
||||
|
||||
ParsedMetadata {
|
||||
title,
|
||||
series,
|
||||
volume,
|
||||
page_count: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_metadata(
|
||||
path: &Path,
|
||||
format: BookFormat,
|
||||
library_root: &Path,
|
||||
) -> Result<ParsedMetadata> {
|
||||
let mut meta = parse_metadata_fast(path, format, library_root);
|
||||
|
||||
meta.page_count = match format {
|
||||
BookFormat::Cbz => parse_cbz_page_count(path).ok(),
|
||||
BookFormat::Cbr => parse_cbr_page_count(path).ok(),
|
||||
BookFormat::Pdf => parse_pdf_page_count(path).ok(),
|
||||
};
|
||||
|
||||
Ok(ParsedMetadata {
|
||||
title,
|
||||
series,
|
||||
volume,
|
||||
page_count,
|
||||
})
|
||||
Ok(meta)
|
||||
}
|
||||
|
||||
fn extract_volume(filename: &str) -> Option<i32> {
|
||||
// Common volume patterns: T01, T02, T1, T2, Vol 1, Vol. 1, Volume 1, #1, #01, - 1, - 01
|
||||
let patterns = [
|
||||
// T01, T02 pattern (most common for manga/comics)
|
||||
(r"(?i)T(\d+)", 1),
|
||||
// Vol 1, Vol. 1, Volume 1
|
||||
(r"(?i)Vol\.?\s*(\d+)", 1),
|
||||
(r"(?i)Volume\s*(\d+)", 1),
|
||||
// #1, #01
|
||||
(r"#(\d+)", 1),
|
||||
// - 1, - 01 at the end
|
||||
(r"-\s*(\d+)\s*$", 1),
|
||||
];
|
||||
/// Open an archive once and return (page_count, first_page_bytes).
|
||||
/// This is more efficient than calling parse_metadata + extract_first_page separately.
|
||||
pub fn analyze_book(path: &Path, format: BookFormat) -> Result<(i32, Vec<u8>)> {
|
||||
match format {
|
||||
BookFormat::Cbz => analyze_cbz(path),
|
||||
BookFormat::Cbr => analyze_cbr(path),
|
||||
BookFormat::Pdf => analyze_pdf(path),
|
||||
}
|
||||
}
|
||||
|
||||
for (pattern, group) in &patterns {
|
||||
if let Ok(re) = regex::Regex::new(pattern) {
|
||||
if let Some(caps) = re.captures(filename) {
|
||||
if let Some(mat) = caps.get(*group) {
|
||||
// Parse as integer to remove leading zeros
|
||||
return mat.as_str().parse::<i32>().ok();
|
||||
}
|
||||
}
|
||||
fn analyze_cbz(path: &Path) -> Result<(i32, Vec<u8>)> {
|
||||
let file = std::fs::File::open(path)
|
||||
.with_context(|| format!("cannot open cbz: {}", path.display()))?;
|
||||
let mut archive = zip::ZipArchive::new(file).context("invalid cbz archive")?;
|
||||
|
||||
let mut image_names: Vec<String> = Vec::new();
|
||||
for i in 0..archive.len() {
|
||||
let entry = archive.by_index(i).context("cannot read cbz entry")?;
|
||||
let name = entry.name().to_ascii_lowercase();
|
||||
if is_image_name(&name) {
|
||||
image_names.push(entry.name().to_string());
|
||||
}
|
||||
}
|
||||
image_names.sort();
|
||||
|
||||
let count = image_names.len() as i32;
|
||||
let first_image = image_names.first().context("no images found in cbz")?;
|
||||
|
||||
let mut entry = archive
|
||||
.by_name(first_image)
|
||||
.context("cannot read first image")?;
|
||||
let mut buf = Vec::new();
|
||||
entry.read_to_end(&mut buf)?;
|
||||
|
||||
Ok((count, buf))
|
||||
}
|
||||
|
||||
fn list_cbr_images(path: &Path) -> Result<Vec<String>> {
|
||||
// Try unrar lb first (fast)
|
||||
let output = std::process::Command::new("unrar")
|
||||
.arg("lb")
|
||||
.arg(path)
|
||||
.output()
|
||||
.with_context(|| format!("failed to execute unrar lb for {}", path.display()))?;
|
||||
|
||||
if output.status.success() {
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
let images: Vec<String> = stdout
|
||||
.lines()
|
||||
.filter(|line| is_image_name(&line.to_ascii_lowercase()))
|
||||
.map(|l| l.to_string())
|
||||
.collect();
|
||||
if !images.is_empty() {
|
||||
return Ok(images);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
// Fallback: lsar (from unar package) handles UTF-16BE encoded filenames
|
||||
let lsar_output = std::process::Command::new("lsar")
|
||||
.arg(path)
|
||||
.output()
|
||||
.with_context(|| format!("failed to execute lsar for {}", path.display()))?;
|
||||
|
||||
if !lsar_output.status.success() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"both unrar lb and lsar failed for {}",
|
||||
path.display()
|
||||
));
|
||||
}
|
||||
|
||||
let stdout = String::from_utf8_lossy(&lsar_output.stdout);
|
||||
// lsar output: first line is archive info, then one file per line (indented)
|
||||
let images: Vec<String> = stdout
|
||||
.lines()
|
||||
.skip(1) // skip the archive header line
|
||||
.map(|l| l.trim().to_string())
|
||||
.filter(|line| is_image_name(&line.to_ascii_lowercase()))
|
||||
.collect();
|
||||
|
||||
Ok(images)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
fn clean_title(filename: &str) -> String {
|
||||
// Remove volume patterns from title to clean it up
|
||||
let cleaned = regex::Regex::new(r"(?i)\s*T\d+\s*")
|
||||
.ok()
|
||||
.and_then(|re| Some(re.replace_all(filename, " ").to_string()))
|
||||
.unwrap_or_else(|| filename.to_string());
|
||||
fn analyze_cbr(path: &Path) -> Result<(i32, Vec<u8>)> {
|
||||
let mut image_names = list_cbr_images(path)?;
|
||||
image_names.sort();
|
||||
|
||||
let cleaned = regex::Regex::new(r"(?i)\s*Vol\.?\s*\d+\s*")
|
||||
.ok()
|
||||
.and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
|
||||
.unwrap_or_else(|| cleaned);
|
||||
let count = image_names.len() as i32;
|
||||
if count == 0 {
|
||||
return Err(anyhow::anyhow!("no images found in cbr: {}", path.display()));
|
||||
}
|
||||
|
||||
let cleaned = regex::Regex::new(r"(?i)\s*Volume\s*\d+\s*")
|
||||
.ok()
|
||||
.and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
|
||||
.unwrap_or_else(|| cleaned);
|
||||
let first_name = &image_names[0];
|
||||
|
||||
let cleaned = regex::Regex::new(r"#\d+")
|
||||
.ok()
|
||||
.and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
|
||||
.unwrap_or_else(|| cleaned);
|
||||
// Try unrar p to extract first image to stdout (faster — no temp dir)
|
||||
let p_output = std::process::Command::new("unrar")
|
||||
.args(["p", "-inul"])
|
||||
.arg(path)
|
||||
.arg(first_name)
|
||||
.output();
|
||||
|
||||
let cleaned = regex::Regex::new(r"-\s*\d+\s*$")
|
||||
.ok()
|
||||
.and_then(|re| Some(re.replace_all(&cleaned, " ").to_string()))
|
||||
.unwrap_or_else(|| cleaned);
|
||||
match p_output {
|
||||
Ok(out) if out.status.success() && looks_like_image(&out.stdout) => Ok((count, out.stdout)),
|
||||
_ => {
|
||||
// Fallback: full extraction with unar (handles special chars, encoding issues)
|
||||
let image_bytes = extract_cbr_first_page(path)?;
|
||||
Ok((count, image_bytes))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up extra spaces
|
||||
cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
|
||||
/// Check image magic bytes to validate that bytes are a real image before decoding.
///
/// Used to vet `unrar p` stdout before handing the bytes to the image
/// decoder: anything that is not a recognized signature forces the safe
/// temp-dir extraction fallback. Recognizes JPEG, PNG, GIF, and WebP.
fn looks_like_image(bytes: &[u8]) -> bool {
    // Longest prefix inspected is 12 bytes (RIFF....WEBP).
    if bytes.len() < 12 {
        return false;
    }
    // JPEG: FF D8 FF
    if bytes.starts_with(&[0xFF, 0xD8, 0xFF]) {
        return true;
    }
    // PNG: 89 50 4E 47 0D 0A 1A 0A (first four bytes are distinctive enough)
    if bytes.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
        return true;
    }
    // GIF: "GIF87a" or "GIF89a". Without this, a GIF first page always took
    // the slow extraction fallback even though the fallback returns the
    // exact same bytes to the decoder.
    if bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a") {
        return true;
    }
    // WebP: RIFF container with "WEBP" fourcc at offset 8
    bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
}
|
||||
|
||||
fn analyze_pdf(path: &Path) -> Result<(i32, Vec<u8>)> {
|
||||
let count = parse_pdf_page_count(path)?;
|
||||
let image_bytes = extract_pdf_first_page(path)?;
|
||||
Ok((count, image_bytes))
|
||||
}
|
||||
|
||||
fn parse_cbz_page_count(path: &Path) -> Result<i32> {
|
||||
@@ -192,26 +303,11 @@ fn parse_cbz_page_count(path: &Path) -> Result<i32> {
|
||||
}
|
||||
|
||||
fn parse_cbr_page_count(path: &Path) -> Result<i32> {
|
||||
let output = std::process::Command::new("unrar")
|
||||
.arg("lb")
|
||||
.arg(path)
|
||||
.output()
|
||||
.with_context(|| format!("failed to execute unrar for {}", path.display()))?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(anyhow::anyhow!("unrar failed for {}", path.display()));
|
||||
}
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
let count = stdout
|
||||
.lines()
|
||||
.filter(|line| is_image_name(&line.to_ascii_lowercase()))
|
||||
.count() as i32;
|
||||
Ok(count)
|
||||
let images = list_cbr_images(path)?;
|
||||
Ok(images.len() as i32)
|
||||
}
|
||||
|
||||
fn parse_pdf_page_count(path: &Path) -> Result<i32> {
|
||||
// Use pdfinfo command line tool instead of lopdf for better performance
|
||||
let output = std::process::Command::new("pdfinfo")
|
||||
.arg(path)
|
||||
.output()
|
||||
@@ -238,6 +334,10 @@ fn parse_pdf_page_count(path: &Path) -> Result<i32> {
|
||||
}
|
||||
|
||||
fn is_image_name(name: &str) -> bool {
|
||||
// Skip macOS metadata entries (__MACOSX/ prefix or AppleDouble ._* files)
|
||||
if name.starts_with("__macosx/") || name.contains("/._") || name.starts_with("._") {
|
||||
return false;
|
||||
}
|
||||
name.ends_with(".jpg")
|
||||
|| name.ends_with(".jpeg")
|
||||
|| name.ends_with(".png")
|
||||
@@ -282,7 +382,6 @@ fn extract_cbr_first_page(path: &Path) -> Result<Vec<u8>> {
|
||||
let tmp_dir = std::env::temp_dir().join(format!("stripstream-cbr-thumb-{}", Uuid::new_v4()));
|
||||
std::fs::create_dir_all(&tmp_dir).context("cannot create temp dir")?;
|
||||
|
||||
// Use env command like the API does
|
||||
let output = std::process::Command::new("env")
|
||||
.args(["LC_ALL=en_US.UTF-8", "LANG=en_US.UTF-8", "unar", "-o"])
|
||||
.arg(&tmp_dir)
|
||||
@@ -298,7 +397,6 @@ fn extract_cbr_first_page(path: &Path) -> Result<Vec<u8>> {
|
||||
));
|
||||
}
|
||||
|
||||
// Use WalkDir for recursive search (CBR can have subdirectories)
|
||||
let mut image_files: Vec<_> = WalkDir::new(&tmp_dir)
|
||||
.into_iter()
|
||||
.filter_map(|e| e.ok())
|
||||
@@ -346,3 +444,33 @@ fn extract_pdf_first_page(path: &Path) -> Result<Vec<u8>> {
|
||||
let _ = std::fs::remove_dir_all(&tmp_dir);
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
fn clean_title(filename: &str) -> String {
|
||||
let cleaned = regex::Regex::new(r"(?i)\s*T\d+\s*")
|
||||
.ok()
|
||||
.map(|re| re.replace_all(filename, " ").to_string())
|
||||
.unwrap_or_else(|| filename.to_string());
|
||||
|
||||
let cleaned = regex::Regex::new(r"(?i)\s*Vol\.?\s*\d+\s*")
|
||||
.ok()
|
||||
.map(|re| re.replace_all(&cleaned, " ").to_string())
|
||||
.unwrap_or(cleaned);
|
||||
|
||||
let cleaned = regex::Regex::new(r"(?i)\s*Volume\s*\d+\s*")
|
||||
.ok()
|
||||
.map(|re| re.replace_all(&cleaned, " ").to_string())
|
||||
.unwrap_or(cleaned);
|
||||
|
||||
let cleaned = regex::Regex::new(r"#\d+")
|
||||
.ok()
|
||||
.map(|re| re.replace_all(&cleaned, " ").to_string())
|
||||
.unwrap_or(cleaned);
|
||||
|
||||
let cleaned = regex::Regex::new(r"-\s*\d+\s*$")
|
||||
.ok()
|
||||
.map(|re| re.replace_all(&cleaned, " ").to_string())
|
||||
.unwrap_or(cleaned);
|
||||
|
||||
cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user