Phase 1 (discovery): walkdir + filename-only metadata, zero archive I/O. Books are visible immediately in the UI while Phase 2 runs in background. Phase 2 (analysis): open each archive once via analyze_book() to extract page_count and first page bytes, then generate WebP thumbnail directly in the indexer — removing the HTTP roundtrip to the API checkup endpoint. - Add parse_metadata_fast() (infallible, no archive I/O) - Add analyze_book() returning (page_count, first_page_bytes) in one pass - Add looks_like_image() magic bytes check for unrar p stdout validation - Add lsar fallback in list_cbr_images() for UTF-16BE encoded filenames - Add directory_mtimes table to skip unchanged dirs on incremental scans - Add analyzer.rs: generate_thumbnail, analyze_library_books, regenerate_thumbnails - Remove run_checkup() from API; indexer handles thumbnail jobs directly - Remove api_base_url/api_bootstrap_token from IndexerConfig and AppState - Add unar + poppler-utils to indexer Dockerfile - Fix smoke.sh: wait for job completion, check thumbnail_url field Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
477 lines
14 KiB
Rust
477 lines
14 KiB
Rust
use anyhow::{Context, Result};
|
|
use std::io::Read;
|
|
use std::path::Path;
|
|
use std::process::Command;
|
|
use std::sync::OnceLock;
|
|
use uuid::Uuid;
|
|
use walkdir::WalkDir;
|
|
|
|
/// Supported book container formats, detected from the file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BookFormat {
    /// Zip-compressed comic book archive (`.cbz`).
    Cbz,
    /// RAR-compressed comic book archive (`.cbr`).
    Cbr,
    /// PDF document (`.pdf`).
    Pdf,
}
|
|
|
|
impl BookFormat {
|
|
pub fn as_str(self) -> &'static str {
|
|
match self {
|
|
Self::Cbz => "cbz",
|
|
Self::Cbr => "cbr",
|
|
Self::Pdf => "pdf",
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Metadata derived from a book's path. Only `page_count` requires opening
/// the archive (see `parse_metadata`); everything else comes from the
/// filename and directory layout.
#[derive(Debug, Clone)]
pub struct ParsedMetadata {
    /// Display title — the filename without its extension.
    pub title: String,
    /// First directory component under the library root, if any.
    pub series: Option<String>,
    /// Volume number extracted from the filename (T01 / Vol 1 / #1 / trailing "- 1").
    pub volume: Option<i32>,
    /// Number of pages; `None` until the archive has been analyzed.
    pub page_count: Option<i32>,
}
|
|
|
|
pub fn detect_format(path: &Path) -> Option<BookFormat> {
|
|
let ext = path.extension()?.to_string_lossy().to_ascii_lowercase();
|
|
match ext.as_str() {
|
|
"cbz" => Some(BookFormat::Cbz),
|
|
"cbr" => Some(BookFormat::Cbr),
|
|
"pdf" => Some(BookFormat::Pdf),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
// Cache compiled regex patterns — compiled once on first use
|
|
static VOLUME_PATTERNS: OnceLock<Vec<(regex::Regex, usize)>> = OnceLock::new();
|
|
|
|
fn get_volume_patterns() -> &'static Vec<(regex::Regex, usize)> {
|
|
VOLUME_PATTERNS.get_or_init(|| {
|
|
[
|
|
// T01, T02 pattern (most common for manga/comics)
|
|
(r"(?i)T(\d+)", 1usize),
|
|
// Vol 1, Vol. 1, Volume 1
|
|
(r"(?i)Vol\.?\s*(\d+)", 1),
|
|
(r"(?i)Volume\s*(\d+)", 1),
|
|
// #1, #01
|
|
(r"#(\d+)", 1),
|
|
// - 1, - 01 at the end
|
|
(r"-\s*(\d+)\s*$", 1),
|
|
]
|
|
.iter()
|
|
.filter_map(|(pattern, group)| {
|
|
regex::Regex::new(pattern).ok().map(|re| (re, *group))
|
|
})
|
|
.collect()
|
|
})
|
|
}
|
|
|
|
fn extract_volume(filename: &str) -> Option<i32> {
|
|
for (re, group) in get_volume_patterns() {
|
|
if let Some(caps) = re.captures(filename) {
|
|
if let Some(mat) = caps.get(*group) {
|
|
return mat.as_str().parse::<i32>().ok();
|
|
}
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Derive the series name: the first directory component of the book's path
/// relative to the library root. Returns `None` for books sitting directly in
/// the root, or when the parent cannot be related to the root at all.
fn extract_series(path: &Path, library_root: &Path) -> Option<String> {
    let parent = path.parent()?;
    let parent_str = parent.to_string_lossy().to_string();
    let root_str = library_root.to_string_lossy().to_string();

    // Prefer a substring search for the root (tolerates lossy-encoding and
    // separator quirks), then fall back to a strict prefix strip.
    let relative = if let Some(idx) = parent_str.find(&root_str) {
        Path::new(&parent_str[idx + root_str.len()..])
    } else if let Ok(rel) = parent.strip_prefix(library_root) {
        rel
    } else {
        eprintln!(
            "[PARSER] Cannot determine series: parent '{}' doesn't start with root '{}'",
            parent.display(),
            library_root.display()
        );
        return None;
    };

    let relative_str = relative.to_string_lossy().to_string();
    let trimmed = relative_str.trim_start_matches(['/', '\\']);
    if trimmed.is_empty() {
        // Book lives directly in the library root — no series directory.
        return None;
    }

    // Series = first path component under the root.
    let series = match trimmed.find(['/', '\\']) {
        Some(idx) => &trimmed[..idx],
        None => trimmed,
    };
    (!series.is_empty()).then(|| series.to_string())
}
|
|
|
|
/// Fast metadata extraction from filename only — no archive I/O. Always succeeds.
|
|
pub fn parse_metadata_fast(path: &Path, _format: BookFormat, library_root: &Path) -> ParsedMetadata {
|
|
let filename = path
|
|
.file_stem()
|
|
.map(|s| s.to_string_lossy().to_string())
|
|
.unwrap_or_else(|| "Untitled".to_string());
|
|
|
|
let volume = extract_volume(&filename);
|
|
let title = filename;
|
|
let series = extract_series(path, library_root);
|
|
|
|
ParsedMetadata {
|
|
title,
|
|
series,
|
|
volume,
|
|
page_count: None,
|
|
}
|
|
}
|
|
|
|
pub fn parse_metadata(
|
|
path: &Path,
|
|
format: BookFormat,
|
|
library_root: &Path,
|
|
) -> Result<ParsedMetadata> {
|
|
let mut meta = parse_metadata_fast(path, format, library_root);
|
|
|
|
meta.page_count = match format {
|
|
BookFormat::Cbz => parse_cbz_page_count(path).ok(),
|
|
BookFormat::Cbr => parse_cbr_page_count(path).ok(),
|
|
BookFormat::Pdf => parse_pdf_page_count(path).ok(),
|
|
};
|
|
|
|
Ok(meta)
|
|
}
|
|
|
|
/// Open an archive once and return (page_count, first_page_bytes).
|
|
/// This is more efficient than calling parse_metadata + extract_first_page separately.
|
|
pub fn analyze_book(path: &Path, format: BookFormat) -> Result<(i32, Vec<u8>)> {
|
|
match format {
|
|
BookFormat::Cbz => analyze_cbz(path),
|
|
BookFormat::Cbr => analyze_cbr(path),
|
|
BookFormat::Pdf => analyze_pdf(path),
|
|
}
|
|
}
|
|
|
|
fn analyze_cbz(path: &Path) -> Result<(i32, Vec<u8>)> {
|
|
let file = std::fs::File::open(path)
|
|
.with_context(|| format!("cannot open cbz: {}", path.display()))?;
|
|
let mut archive = zip::ZipArchive::new(file).context("invalid cbz archive")?;
|
|
|
|
let mut image_names: Vec<String> = Vec::new();
|
|
for i in 0..archive.len() {
|
|
let entry = archive.by_index(i).context("cannot read cbz entry")?;
|
|
let name = entry.name().to_ascii_lowercase();
|
|
if is_image_name(&name) {
|
|
image_names.push(entry.name().to_string());
|
|
}
|
|
}
|
|
image_names.sort();
|
|
|
|
let count = image_names.len() as i32;
|
|
let first_image = image_names.first().context("no images found in cbz")?;
|
|
|
|
let mut entry = archive
|
|
.by_name(first_image)
|
|
.context("cannot read first image")?;
|
|
let mut buf = Vec::new();
|
|
entry.read_to_end(&mut buf)?;
|
|
|
|
Ok((count, buf))
|
|
}
|
|
|
|
fn list_cbr_images(path: &Path) -> Result<Vec<String>> {
|
|
// Try unrar lb first (fast)
|
|
let output = std::process::Command::new("unrar")
|
|
.arg("lb")
|
|
.arg(path)
|
|
.output()
|
|
.with_context(|| format!("failed to execute unrar lb for {}", path.display()))?;
|
|
|
|
if output.status.success() {
|
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
let images: Vec<String> = stdout
|
|
.lines()
|
|
.filter(|line| is_image_name(&line.to_ascii_lowercase()))
|
|
.map(|l| l.to_string())
|
|
.collect();
|
|
if !images.is_empty() {
|
|
return Ok(images);
|
|
}
|
|
}
|
|
|
|
// Fallback: lsar (from unar package) handles UTF-16BE encoded filenames
|
|
let lsar_output = std::process::Command::new("lsar")
|
|
.arg(path)
|
|
.output()
|
|
.with_context(|| format!("failed to execute lsar for {}", path.display()))?;
|
|
|
|
if !lsar_output.status.success() {
|
|
return Err(anyhow::anyhow!(
|
|
"both unrar lb and lsar failed for {}",
|
|
path.display()
|
|
));
|
|
}
|
|
|
|
let stdout = String::from_utf8_lossy(&lsar_output.stdout);
|
|
// lsar output: first line is archive info, then one file per line (indented)
|
|
let images: Vec<String> = stdout
|
|
.lines()
|
|
.skip(1) // skip the archive header line
|
|
.map(|l| l.trim().to_string())
|
|
.filter(|line| is_image_name(&line.to_ascii_lowercase()))
|
|
.collect();
|
|
|
|
Ok(images)
|
|
}
|
|
|
|
fn analyze_cbr(path: &Path) -> Result<(i32, Vec<u8>)> {
|
|
let mut image_names = list_cbr_images(path)?;
|
|
image_names.sort();
|
|
|
|
let count = image_names.len() as i32;
|
|
if count == 0 {
|
|
return Err(anyhow::anyhow!("no images found in cbr: {}", path.display()));
|
|
}
|
|
|
|
let first_name = &image_names[0];
|
|
|
|
// Try unrar p to extract first image to stdout (faster — no temp dir)
|
|
let p_output = std::process::Command::new("unrar")
|
|
.args(["p", "-inul"])
|
|
.arg(path)
|
|
.arg(first_name)
|
|
.output();
|
|
|
|
match p_output {
|
|
Ok(out) if out.status.success() && looks_like_image(&out.stdout) => Ok((count, out.stdout)),
|
|
_ => {
|
|
// Fallback: full extraction with unar (handles special chars, encoding issues)
|
|
let image_bytes = extract_cbr_first_page(path)?;
|
|
Ok((count, image_bytes))
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Check image magic bytes to validate that bytes are a real image before decoding.
/// Recognizes JPEG, PNG, WebP, and AVIF — the same set `is_image_name` accepts by
/// extension. (AVIF was previously missing, so AVIF-first archives always took the
/// slow full-extraction fallback even when `unrar p` streamed valid bytes.)
fn looks_like_image(bytes: &[u8]) -> bool {
    // Need at least 12 bytes for the longest signature checked below.
    if bytes.len() < 12 {
        return false;
    }
    // JPEG: FF D8 FF
    if bytes.starts_with(&[0xFF, 0xD8, 0xFF]) {
        return true;
    }
    // PNG: 89 50 4E 47 0D 0A 1A 0A
    if bytes.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
        return true;
    }
    // WebP: RIFF....WEBP
    if &bytes[0..4] == b"RIFF" && &bytes[8..12] == b"WEBP" {
        return true;
    }
    // AVIF (ISO BMFF): 4-byte box size, then "ftyp", then brand "avif"/"avis"
    if &bytes[4..8] == b"ftyp" && (&bytes[8..12] == b"avif" || &bytes[8..12] == b"avis") {
        return true;
    }
    false
}
|
|
|
|
fn analyze_pdf(path: &Path) -> Result<(i32, Vec<u8>)> {
|
|
let count = parse_pdf_page_count(path)?;
|
|
let image_bytes = extract_pdf_first_page(path)?;
|
|
Ok((count, image_bytes))
|
|
}
|
|
|
|
fn parse_cbz_page_count(path: &Path) -> Result<i32> {
|
|
let file = std::fs::File::open(path)
|
|
.with_context(|| format!("cannot open cbz: {}", path.display()))?;
|
|
let mut archive = zip::ZipArchive::new(file).context("invalid cbz archive")?;
|
|
let mut count: i32 = 0;
|
|
for i in 0..archive.len() {
|
|
let entry = archive.by_index(i).context("cannot read cbz entry")?;
|
|
let name = entry.name().to_ascii_lowercase();
|
|
if is_image_name(&name) {
|
|
count += 1;
|
|
}
|
|
}
|
|
Ok(count)
|
|
}
|
|
|
|
fn parse_cbr_page_count(path: &Path) -> Result<i32> {
|
|
let images = list_cbr_images(path)?;
|
|
Ok(images.len() as i32)
|
|
}
|
|
|
|
fn parse_pdf_page_count(path: &Path) -> Result<i32> {
|
|
let output = std::process::Command::new("pdfinfo")
|
|
.arg(path)
|
|
.output()
|
|
.with_context(|| format!("failed to execute pdfinfo for {}", path.display()))?;
|
|
|
|
if !output.status.success() {
|
|
return Err(anyhow::anyhow!("pdfinfo failed for {}", path.display()));
|
|
}
|
|
|
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
for line in stdout.lines() {
|
|
if line.starts_with("Pages:") {
|
|
if let Some(pages_str) = line.split_whitespace().nth(1) {
|
|
return pages_str
|
|
.parse::<i32>()
|
|
.with_context(|| format!("cannot parse page count: {}", pages_str));
|
|
}
|
|
}
|
|
}
|
|
|
|
Err(anyhow::anyhow!(
|
|
"could not find page count in pdfinfo output"
|
|
))
|
|
}
|
|
|
|
/// True when `name` (expected pre-lowercased by callers) looks like a comic
/// page image, excluding macOS metadata entries.
fn is_image_name(name: &str) -> bool {
    // Skip macOS junk: __MACOSX/ directory entries and AppleDouble ._* files.
    let is_macos_junk =
        name.starts_with("__macosx/") || name.contains("/._") || name.starts_with("._");
    if is_macos_junk {
        return false;
    }
    [".jpg", ".jpeg", ".png", ".webp", ".avif"]
        .iter()
        .any(|ext| name.ends_with(ext))
}
|
|
|
|
pub fn extract_first_page(path: &Path, format: BookFormat) -> Result<Vec<u8>> {
|
|
match format {
|
|
BookFormat::Cbz => extract_cbz_first_page(path),
|
|
BookFormat::Cbr => extract_cbr_first_page(path),
|
|
BookFormat::Pdf => extract_pdf_first_page(path),
|
|
}
|
|
}
|
|
|
|
fn extract_cbz_first_page(path: &Path) -> Result<Vec<u8>> {
|
|
let file = std::fs::File::open(path)
|
|
.with_context(|| format!("cannot open cbz: {}", path.display()))?;
|
|
let mut archive = zip::ZipArchive::new(file).context("invalid cbz archive")?;
|
|
|
|
let mut image_names: Vec<String> = Vec::new();
|
|
for i in 0..archive.len() {
|
|
let entry = archive.by_index(i).context("cannot read cbz entry")?;
|
|
let name = entry.name().to_ascii_lowercase();
|
|
if is_image_name(&name) {
|
|
image_names.push(entry.name().to_string());
|
|
}
|
|
}
|
|
image_names.sort();
|
|
|
|
let first_image = image_names.first().context("no images found in cbz")?;
|
|
|
|
let mut entry = archive
|
|
.by_name(first_image)
|
|
.context("cannot read first image")?;
|
|
let mut buf = Vec::new();
|
|
entry.read_to_end(&mut buf)?;
|
|
Ok(buf)
|
|
}
|
|
|
|
fn extract_cbr_first_page(path: &Path) -> Result<Vec<u8>> {
|
|
let tmp_dir = std::env::temp_dir().join(format!("stripstream-cbr-thumb-{}", Uuid::new_v4()));
|
|
std::fs::create_dir_all(&tmp_dir).context("cannot create temp dir")?;
|
|
|
|
let output = std::process::Command::new("env")
|
|
.args(["LC_ALL=en_US.UTF-8", "LANG=en_US.UTF-8", "unar", "-o"])
|
|
.arg(&tmp_dir)
|
|
.arg(path)
|
|
.output()
|
|
.context("unar failed")?;
|
|
|
|
if !output.status.success() {
|
|
let _ = std::fs::remove_dir_all(&tmp_dir);
|
|
return Err(anyhow::anyhow!(
|
|
"unar extract failed: {:?}",
|
|
String::from_utf8_lossy(&output.stderr)
|
|
));
|
|
}
|
|
|
|
let mut image_files: Vec<_> = WalkDir::new(&tmp_dir)
|
|
.into_iter()
|
|
.filter_map(|e| e.ok())
|
|
.filter(|e| {
|
|
let name = e.file_name().to_string_lossy().to_lowercase();
|
|
is_image_name(&name)
|
|
})
|
|
.collect();
|
|
|
|
image_files.sort_by_key(|e| e.path().to_string_lossy().to_lowercase());
|
|
|
|
let first_image = image_files.first().context("no images found in cbr")?;
|
|
|
|
let data = std::fs::read(first_image.path())?;
|
|
let _ = std::fs::remove_dir_all(&tmp_dir);
|
|
Ok(data)
|
|
}
|
|
|
|
fn extract_pdf_first_page(path: &Path) -> Result<Vec<u8>> {
|
|
let tmp_dir = std::env::temp_dir().join(format!("stripstream-pdf-thumb-{}", Uuid::new_v4()));
|
|
std::fs::create_dir_all(&tmp_dir)?;
|
|
let output_prefix = tmp_dir.join("page");
|
|
|
|
let output = Command::new("pdftoppm")
|
|
.args([
|
|
"-f",
|
|
"1",
|
|
"-singlefile",
|
|
"-png",
|
|
"-scale-to",
|
|
"800",
|
|
path.to_str().unwrap(),
|
|
output_prefix.to_str().unwrap(),
|
|
])
|
|
.output()
|
|
.context("pdftoppm failed")?;
|
|
|
|
if !output.status.success() {
|
|
let _ = std::fs::remove_dir_all(&tmp_dir);
|
|
return Err(anyhow::anyhow!("pdftoppm failed"));
|
|
}
|
|
|
|
let image_path = output_prefix.with_extension("png");
|
|
let data = std::fs::read(&image_path)?;
|
|
let _ = std::fs::remove_dir_all(&tmp_dir);
|
|
Ok(data)
|
|
}
|
|
|
|
#[allow(dead_code)]
|
|
fn clean_title(filename: &str) -> String {
|
|
let cleaned = regex::Regex::new(r"(?i)\s*T\d+\s*")
|
|
.ok()
|
|
.map(|re| re.replace_all(filename, " ").to_string())
|
|
.unwrap_or_else(|| filename.to_string());
|
|
|
|
let cleaned = regex::Regex::new(r"(?i)\s*Vol\.?\s*\d+\s*")
|
|
.ok()
|
|
.map(|re| re.replace_all(&cleaned, " ").to_string())
|
|
.unwrap_or(cleaned);
|
|
|
|
let cleaned = regex::Regex::new(r"(?i)\s*Volume\s*\d+\s*")
|
|
.ok()
|
|
.map(|re| re.replace_all(&cleaned, " ").to_string())
|
|
.unwrap_or(cleaned);
|
|
|
|
let cleaned = regex::Regex::new(r"#\d+")
|
|
.ok()
|
|
.map(|re| re.replace_all(&cleaned, " ").to_string())
|
|
.unwrap_or(cleaned);
|
|
|
|
let cleaned = regex::Regex::new(r"-\s*\d+\s*$")
|
|
.ok()
|
|
.map(|re| re.replace_all(&cleaned, " ").to_string())
|
|
.unwrap_or(cleaned);
|
|
|
|
cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
|
|
}
|