|
|
|
|
@@ -2,7 +2,7 @@ use anyhow::Context;
|
|
|
|
|
use axum::{extract::State, routing::get, Json, Router};
|
|
|
|
|
use chrono::{DateTime, Utc};
|
|
|
|
|
use axum::http::StatusCode;
|
|
|
|
|
use notify::{Config, Event, RecommendedWatcher, RecursiveMode, Watcher};
|
|
|
|
|
use notify::{Event, RecommendedWatcher, RecursiveMode, Watcher};
|
|
|
|
|
use parsers::{detect_format, parse_metadata, BookFormat};
|
|
|
|
|
use serde::Serialize;
|
|
|
|
|
use sha2::{Digest, Sha256};
|
|
|
|
|
@@ -57,7 +57,7 @@ async fn main() -> anyhow::Result<()> {
|
|
|
|
|
|
|
|
|
|
let config = IndexerConfig::from_env()?;
|
|
|
|
|
let pool = PgPoolOptions::new()
|
|
|
|
|
.max_connections(5)
|
|
|
|
|
.max_connections(20)
|
|
|
|
|
.connect(&config.database_url)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
@@ -92,8 +92,41 @@ async fn ready(State(state): State<AppState>) -> Result<Json<serde_json::Value>,
|
|
|
|
|
Ok(Json(serde_json::json!({"status": "ready"})))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn cleanup_stale_jobs(pool: &sqlx::PgPool) -> anyhow::Result<()> {
|
|
|
|
|
// Mark jobs that have been running for more than 5 minutes as failed
|
|
|
|
|
// This handles cases where the indexer was restarted while jobs were running
|
|
|
|
|
let result = sqlx::query(
|
|
|
|
|
r#"
|
|
|
|
|
UPDATE index_jobs
|
|
|
|
|
SET status = 'failed',
|
|
|
|
|
finished_at = NOW(),
|
|
|
|
|
error_opt = 'Job interrupted by indexer restart'
|
|
|
|
|
WHERE status = 'running'
|
|
|
|
|
AND started_at < NOW() - INTERVAL '5 minutes'
|
|
|
|
|
RETURNING id
|
|
|
|
|
"#
|
|
|
|
|
)
|
|
|
|
|
.fetch_all(pool)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
if !result.is_empty() {
|
|
|
|
|
let count = result.len();
|
|
|
|
|
let ids: Vec<String> = result.iter()
|
|
|
|
|
.map(|row| row.get::<Uuid, _>("id").to_string())
|
|
|
|
|
.collect();
|
|
|
|
|
info!("[CLEANUP] Marked {} stale job(s) as failed: {}", count, ids.join(", "));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn run_worker(state: AppState, interval_seconds: u64) {
|
|
|
|
|
let wait = Duration::from_secs(interval_seconds.max(1));
|
|
|
|
|
|
|
|
|
|
// Cleanup stale jobs from previous runs
|
|
|
|
|
if let Err(err) = cleanup_stale_jobs(&state.pool).await {
|
|
|
|
|
error!("[CLEANUP] Failed to cleanup stale jobs: {}", err);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Start file watcher task
|
|
|
|
|
let watcher_state = state.clone();
|
|
|
|
|
@@ -121,8 +154,14 @@ async fn run_worker(state: AppState, interval_seconds: u64) {
|
|
|
|
|
Ok(Some((job_id, library_id))) => {
|
|
|
|
|
info!("[INDEXER] Starting job {} library={:?}", job_id, library_id);
|
|
|
|
|
if let Err(err) = process_job(&state, job_id, library_id).await {
|
|
|
|
|
error!("[INDEXER] Job {} failed: {}", job_id, err);
|
|
|
|
|
let _ = fail_job(&state.pool, job_id, &err.to_string()).await;
|
|
|
|
|
let err_str = err.to_string();
|
|
|
|
|
if err_str.contains("cancelled") || err_str.contains("Cancelled") {
|
|
|
|
|
info!("[INDEXER] Job {} was cancelled by user", job_id);
|
|
|
|
|
// Status is already 'cancelled' in DB, don't change it
|
|
|
|
|
} else {
|
|
|
|
|
error!("[INDEXER] Job {} failed: {}", job_id, err);
|
|
|
|
|
let _ = fail_job(&state.pool, job_id, &err_str).await;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
info!("[INDEXER] Job {} completed", job_id);
|
|
|
|
|
}
|
|
|
|
|
@@ -432,6 +471,8 @@ async fn process_job(state: &AppState, job_id: Uuid, target_library_id: Option<U
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
info!("[JOB] Found {} libraries, {} total files to index", libraries.len(), total_files);
|
|
|
|
|
|
|
|
|
|
// Update job with total estimate
|
|
|
|
|
sqlx::query("UPDATE index_jobs SET total_files = $2 WHERE id = $1")
|
|
|
|
|
.bind(job_id)
|
|
|
|
|
@@ -483,6 +524,244 @@ async fn fail_job(pool: &sqlx::PgPool, job_id: Uuid, error_message: &str) -> any
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Batched update data structures
|
|
|
|
|
struct BookUpdate {
|
|
|
|
|
book_id: Uuid,
|
|
|
|
|
title: String,
|
|
|
|
|
kind: String,
|
|
|
|
|
series: Option<String>,
|
|
|
|
|
volume: Option<i32>,
|
|
|
|
|
page_count: Option<i32>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct FileUpdate {
|
|
|
|
|
file_id: Uuid,
|
|
|
|
|
format: String,
|
|
|
|
|
size_bytes: i64,
|
|
|
|
|
mtime: DateTime<Utc>,
|
|
|
|
|
fingerprint: String,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct BookInsert {
|
|
|
|
|
book_id: Uuid,
|
|
|
|
|
library_id: Uuid,
|
|
|
|
|
kind: String,
|
|
|
|
|
title: String,
|
|
|
|
|
series: Option<String>,
|
|
|
|
|
volume: Option<i32>,
|
|
|
|
|
page_count: Option<i32>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct FileInsert {
|
|
|
|
|
file_id: Uuid,
|
|
|
|
|
book_id: Uuid,
|
|
|
|
|
format: String,
|
|
|
|
|
abs_path: String,
|
|
|
|
|
size_bytes: i64,
|
|
|
|
|
mtime: DateTime<Utc>,
|
|
|
|
|
fingerprint: String,
|
|
|
|
|
parse_status: String,
|
|
|
|
|
parse_error: Option<String>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
struct ErrorInsert {
|
|
|
|
|
job_id: Uuid,
|
|
|
|
|
file_path: String,
|
|
|
|
|
error_message: String,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn flush_all_batches(
|
|
|
|
|
pool: &sqlx::PgPool,
|
|
|
|
|
books_update: &mut Vec<BookUpdate>,
|
|
|
|
|
files_update: &mut Vec<FileUpdate>,
|
|
|
|
|
books_insert: &mut Vec<BookInsert>,
|
|
|
|
|
files_insert: &mut Vec<FileInsert>,
|
|
|
|
|
errors_insert: &mut Vec<ErrorInsert>,
|
|
|
|
|
) -> anyhow::Result<()> {
|
|
|
|
|
if books_update.is_empty() && files_update.is_empty() && books_insert.is_empty() && files_insert.is_empty() && errors_insert.is_empty() {
|
|
|
|
|
return Ok(());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let start = std::time::Instant::now();
|
|
|
|
|
let mut tx = pool.begin().await?;
|
|
|
|
|
|
|
|
|
|
// Batch update books using UNNEST
|
|
|
|
|
if !books_update.is_empty() {
|
|
|
|
|
let book_ids: Vec<Uuid> = books_update.iter().map(|b| b.book_id).collect();
|
|
|
|
|
let titles: Vec<String> = books_update.iter().map(|b| b.title.clone()).collect();
|
|
|
|
|
let kinds: Vec<String> = books_update.iter().map(|b| b.kind.clone()).collect();
|
|
|
|
|
let series: Vec<Option<String>> = books_update.iter().map(|b| b.series.clone()).collect();
|
|
|
|
|
let volumes: Vec<Option<i32>> = books_update.iter().map(|b| b.volume).collect();
|
|
|
|
|
let page_counts: Vec<Option<i32>> = books_update.iter().map(|b| b.page_count).collect();
|
|
|
|
|
|
|
|
|
|
sqlx::query(
|
|
|
|
|
r#"
|
|
|
|
|
UPDATE books SET
|
|
|
|
|
title = data.title,
|
|
|
|
|
kind = data.kind,
|
|
|
|
|
series = data.series,
|
|
|
|
|
volume = data.volume,
|
|
|
|
|
page_count = data.page_count,
|
|
|
|
|
updated_at = NOW()
|
|
|
|
|
FROM (
|
|
|
|
|
SELECT * FROM UNNEST($1::uuid[], $2::text[], $3::text[], $4::text[], $5::int[], $6::int[])
|
|
|
|
|
AS t(book_id, title, kind, series, volume, page_count)
|
|
|
|
|
) AS data
|
|
|
|
|
WHERE books.id = data.book_id
|
|
|
|
|
"#
|
|
|
|
|
)
|
|
|
|
|
.bind(&book_ids)
|
|
|
|
|
.bind(&titles)
|
|
|
|
|
.bind(&kinds)
|
|
|
|
|
.bind(&series)
|
|
|
|
|
.bind(&volumes)
|
|
|
|
|
.bind(&page_counts)
|
|
|
|
|
.execute(&mut *tx)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
books_update.clear();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Batch update files using UNNEST
|
|
|
|
|
if !files_update.is_empty() {
|
|
|
|
|
let file_ids: Vec<Uuid> = files_update.iter().map(|f| f.file_id).collect();
|
|
|
|
|
let formats: Vec<String> = files_update.iter().map(|f| f.format.clone()).collect();
|
|
|
|
|
let sizes: Vec<i64> = files_update.iter().map(|f| f.size_bytes).collect();
|
|
|
|
|
let mtimes: Vec<DateTime<Utc>> = files_update.iter().map(|f| f.mtime).collect();
|
|
|
|
|
let fingerprints: Vec<String> = files_update.iter().map(|f| f.fingerprint.clone()).collect();
|
|
|
|
|
|
|
|
|
|
sqlx::query(
|
|
|
|
|
r#"
|
|
|
|
|
UPDATE book_files SET
|
|
|
|
|
format = data.format,
|
|
|
|
|
size_bytes = data.size,
|
|
|
|
|
mtime = data.mtime,
|
|
|
|
|
fingerprint = data.fp,
|
|
|
|
|
parse_status = 'ok',
|
|
|
|
|
parse_error_opt = NULL,
|
|
|
|
|
updated_at = NOW()
|
|
|
|
|
FROM (
|
|
|
|
|
SELECT * FROM UNNEST($1::uuid[], $2::text[], $3::bigint[], $4::timestamptz[], $5::text[])
|
|
|
|
|
AS t(file_id, format, size, mtime, fp)
|
|
|
|
|
) AS data
|
|
|
|
|
WHERE book_files.id = data.file_id
|
|
|
|
|
"#
|
|
|
|
|
)
|
|
|
|
|
.bind(&file_ids)
|
|
|
|
|
.bind(&formats)
|
|
|
|
|
.bind(&sizes)
|
|
|
|
|
.bind(&mtimes)
|
|
|
|
|
.bind(&fingerprints)
|
|
|
|
|
.execute(&mut *tx)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
files_update.clear();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Batch insert books using UNNEST
|
|
|
|
|
if !books_insert.is_empty() {
|
|
|
|
|
let book_ids: Vec<Uuid> = books_insert.iter().map(|b| b.book_id).collect();
|
|
|
|
|
let library_ids: Vec<Uuid> = books_insert.iter().map(|b| b.library_id).collect();
|
|
|
|
|
let kinds: Vec<String> = books_insert.iter().map(|b| b.kind.clone()).collect();
|
|
|
|
|
let titles: Vec<String> = books_insert.iter().map(|b| b.title.clone()).collect();
|
|
|
|
|
let series: Vec<Option<String>> = books_insert.iter().map(|b| b.series.clone()).collect();
|
|
|
|
|
let volumes: Vec<Option<i32>> = books_insert.iter().map(|b| b.volume).collect();
|
|
|
|
|
let page_counts: Vec<Option<i32>> = books_insert.iter().map(|b| b.page_count).collect();
|
|
|
|
|
|
|
|
|
|
sqlx::query(
|
|
|
|
|
r#"
|
|
|
|
|
INSERT INTO books (id, library_id, kind, title, series, volume, page_count)
|
|
|
|
|
SELECT * FROM UNNEST($1::uuid[], $2::uuid[], $3::text[], $4::text[], $5::text[], $6::int[], $7::int[])
|
|
|
|
|
AS t(id, library_id, kind, title, series, volume, page_count)
|
|
|
|
|
"#
|
|
|
|
|
)
|
|
|
|
|
.bind(&book_ids)
|
|
|
|
|
.bind(&library_ids)
|
|
|
|
|
.bind(&kinds)
|
|
|
|
|
.bind(&titles)
|
|
|
|
|
.bind(&series)
|
|
|
|
|
.bind(&volumes)
|
|
|
|
|
.bind(&page_counts)
|
|
|
|
|
.execute(&mut *tx)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
books_insert.clear();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Batch insert files using UNNEST
|
|
|
|
|
if !files_insert.is_empty() {
|
|
|
|
|
let file_ids: Vec<Uuid> = files_insert.iter().map(|f| f.file_id).collect();
|
|
|
|
|
let book_ids: Vec<Uuid> = files_insert.iter().map(|f| f.book_id).collect();
|
|
|
|
|
let formats: Vec<String> = files_insert.iter().map(|f| f.format.clone()).collect();
|
|
|
|
|
let abs_paths: Vec<String> = files_insert.iter().map(|f| f.abs_path.clone()).collect();
|
|
|
|
|
let sizes: Vec<i64> = files_insert.iter().map(|f| f.size_bytes).collect();
|
|
|
|
|
let mtimes: Vec<DateTime<Utc>> = files_insert.iter().map(|f| f.mtime).collect();
|
|
|
|
|
let fingerprints: Vec<String> = files_insert.iter().map(|f| f.fingerprint.clone()).collect();
|
|
|
|
|
let statuses: Vec<String> = files_insert.iter().map(|f| f.parse_status.clone()).collect();
|
|
|
|
|
let errors: Vec<Option<String>> = files_insert.iter().map(|f| f.parse_error.clone()).collect();
|
|
|
|
|
|
|
|
|
|
sqlx::query(
|
|
|
|
|
r#"
|
|
|
|
|
INSERT INTO book_files (id, book_id, format, abs_path, size_bytes, mtime, fingerprint, parse_status, parse_error_opt)
|
|
|
|
|
SELECT * FROM UNNEST($1::uuid[], $2::uuid[], $3::text[], $4::text[], $5::bigint[], $6::timestamptz[], $7::text[], $8::text[], $9::text[])
|
|
|
|
|
AS t(id, book_id, format, abs_path, size_bytes, mtime, fingerprint, parse_status, parse_error_opt)
|
|
|
|
|
"#
|
|
|
|
|
)
|
|
|
|
|
.bind(&file_ids)
|
|
|
|
|
.bind(&book_ids)
|
|
|
|
|
.bind(&formats)
|
|
|
|
|
.bind(&abs_paths)
|
|
|
|
|
.bind(&sizes)
|
|
|
|
|
.bind(&mtimes)
|
|
|
|
|
.bind(&fingerprints)
|
|
|
|
|
.bind(&statuses)
|
|
|
|
|
.bind(&errors)
|
|
|
|
|
.execute(&mut *tx)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
files_insert.clear();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Batch insert errors using UNNEST
|
|
|
|
|
if !errors_insert.is_empty() {
|
|
|
|
|
let job_ids: Vec<Uuid> = errors_insert.iter().map(|e| e.job_id).collect();
|
|
|
|
|
let file_paths: Vec<String> = errors_insert.iter().map(|e| e.file_path.clone()).collect();
|
|
|
|
|
let messages: Vec<String> = errors_insert.iter().map(|e| e.error_message.clone()).collect();
|
|
|
|
|
|
|
|
|
|
sqlx::query(
|
|
|
|
|
r#"
|
|
|
|
|
INSERT INTO index_job_errors (job_id, file_path, error_message)
|
|
|
|
|
SELECT * FROM UNNEST($1::uuid[], $2::text[], $3::text[])
|
|
|
|
|
AS t(job_id, file_path, error_message)
|
|
|
|
|
"#
|
|
|
|
|
)
|
|
|
|
|
.bind(&job_ids)
|
|
|
|
|
.bind(&file_paths)
|
|
|
|
|
.bind(&messages)
|
|
|
|
|
.execute(&mut *tx)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
errors_insert.clear();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tx.commit().await?;
|
|
|
|
|
info!("[BATCH] Flushed all batches in {:?}", start.elapsed());
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Check if job has been cancelled
|
|
|
|
|
async fn is_job_cancelled(pool: &sqlx::PgPool, job_id: Uuid) -> anyhow::Result<bool> {
|
|
|
|
|
let status: Option<String> = sqlx::query_scalar(
|
|
|
|
|
"SELECT status FROM index_jobs WHERE id = $1"
|
|
|
|
|
)
|
|
|
|
|
.bind(job_id)
|
|
|
|
|
.fetch_optional(pool)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
Ok(status.as_deref() == Some("cancelled"))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn scan_library(
|
|
|
|
|
state: &AppState,
|
|
|
|
|
job_id: Uuid,
|
|
|
|
|
@@ -493,6 +772,8 @@ async fn scan_library(
|
|
|
|
|
total_files: usize,
|
|
|
|
|
is_full_rebuild: bool,
|
|
|
|
|
) -> anyhow::Result<()> {
|
|
|
|
|
info!("[SCAN] Starting scan of library {} at path: {} (full_rebuild={})", library_id, root.display(), is_full_rebuild);
|
|
|
|
|
|
|
|
|
|
let existing_rows = sqlx::query(
|
|
|
|
|
r#"
|
|
|
|
|
SELECT bf.id AS file_id, bf.book_id, bf.abs_path, bf.fingerprint
|
|
|
|
|
@@ -506,21 +787,31 @@ async fn scan_library(
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
let mut existing: HashMap<String, (Uuid, Uuid, String)> = HashMap::new();
|
|
|
|
|
// For full rebuilds, don't use existing files - force reindex of everything
|
|
|
|
|
if !is_full_rebuild {
|
|
|
|
|
for row in existing_rows {
|
|
|
|
|
let abs_path: String = row.get("abs_path");
|
|
|
|
|
// Remap for local development to match scanned paths
|
|
|
|
|
let remapped_path = remap_libraries_path(&abs_path);
|
|
|
|
|
existing.insert(
|
|
|
|
|
remapped_path,
|
|
|
|
|
(row.get("file_id"), row.get("book_id"), row.get("fingerprint")),
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
info!("[SCAN] Found {} existing files in database for library {}", existing.len(), library_id);
|
|
|
|
|
} else {
|
|
|
|
|
info!("[SCAN] Full rebuild: skipping existing files lookup (all will be treated as new)");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let mut seen: HashMap<String, bool> = HashMap::new();
|
|
|
|
|
let mut library_processed_count = 0i32;
|
|
|
|
|
let mut last_progress_update = std::time::Instant::now();
|
|
|
|
|
|
|
|
|
|
// Batching buffers
|
|
|
|
|
const BATCH_SIZE: usize = 100;
|
|
|
|
|
let mut books_to_update: Vec<BookUpdate> = Vec::with_capacity(BATCH_SIZE);
|
|
|
|
|
let mut files_to_update: Vec<FileUpdate> = Vec::with_capacity(BATCH_SIZE);
|
|
|
|
|
let mut books_to_insert: Vec<BookInsert> = Vec::with_capacity(BATCH_SIZE);
|
|
|
|
|
let mut files_to_insert: Vec<FileInsert> = Vec::with_capacity(BATCH_SIZE);
|
|
|
|
|
let mut errors_to_insert: Vec<ErrorInsert> = Vec::with_capacity(BATCH_SIZE);
|
|
|
|
|
|
|
|
|
|
for entry in WalkDir::new(root).into_iter().filter_map(Result::ok) {
|
|
|
|
|
if !entry.file_type().is_file() {
|
|
|
|
|
@@ -529,50 +820,59 @@ async fn scan_library(
|
|
|
|
|
|
|
|
|
|
let path = entry.path();
|
|
|
|
|
let Some(format) = detect_format(path) else {
|
|
|
|
|
trace!("[SCAN] Skipping non-book file: {}", path.display());
|
|
|
|
|
continue;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
info!("[SCAN] Found book file: {} (format: {:?})", path.display(), format);
|
|
|
|
|
stats.scanned_files += 1;
|
|
|
|
|
library_processed_count += 1;
|
|
|
|
|
*total_processed_count += 1;
|
|
|
|
|
let abs_path_local = path.to_string_lossy().to_string();
|
|
|
|
|
// Convert local path to /libraries format for DB storage
|
|
|
|
|
let abs_path = unmap_libraries_path(&abs_path_local);
|
|
|
|
|
let file_name = path.file_name()
|
|
|
|
|
.map(|s| s.to_string_lossy().to_string())
|
|
|
|
|
.unwrap_or_else(|| abs_path.clone());
|
|
|
|
|
|
|
|
|
|
info!("[SCAN] Job {} processing file {}/{} (library: {}): {}", job_id, total_processed_count, total_files, library_processed_count, file_name);
|
|
|
|
|
let start_time = std::time::Instant::now();
|
|
|
|
|
|
|
|
|
|
// Update progress in DB using the global processed count
|
|
|
|
|
let progress_percent = if total_files > 0 {
|
|
|
|
|
((*total_processed_count as f64 / total_files as f64) * 100.0) as i32
|
|
|
|
|
} else {
|
|
|
|
|
0
|
|
|
|
|
};
|
|
|
|
|
// Update progress in DB every 1 second or every 10 files
|
|
|
|
|
let should_update_progress = last_progress_update.elapsed() > Duration::from_secs(1) || library_processed_count % 10 == 0;
|
|
|
|
|
if should_update_progress {
|
|
|
|
|
let progress_percent = if total_files > 0 {
|
|
|
|
|
((*total_processed_count as f64 / total_files as f64) * 100.0) as i32
|
|
|
|
|
} else {
|
|
|
|
|
0
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let db_start = std::time::Instant::now();
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"UPDATE index_jobs SET current_file = $2, processed_files = $3, progress_percent = $4 WHERE id = $1"
|
|
|
|
|
)
|
|
|
|
|
.bind(job_id)
|
|
|
|
|
.bind(&file_name)
|
|
|
|
|
.bind(*total_processed_count)
|
|
|
|
|
.bind(progress_percent)
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await
|
|
|
|
|
.map_err(|e| {
|
|
|
|
|
error!("[BDD] Failed to update progress for job {}: {}", job_id, e);
|
|
|
|
|
e
|
|
|
|
|
})?;
|
|
|
|
|
info!("[BDD] Progress update took {:?}", db_start.elapsed());
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"UPDATE index_jobs SET current_file = $2, processed_files = $3, progress_percent = $4 WHERE id = $1"
|
|
|
|
|
)
|
|
|
|
|
.bind(job_id)
|
|
|
|
|
.bind(&file_name)
|
|
|
|
|
.bind(*total_processed_count)
|
|
|
|
|
.bind(progress_percent)
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await
|
|
|
|
|
.map_err(|e| {
|
|
|
|
|
error!("[BDD] Failed to update progress for job {}: {}", job_id, e);
|
|
|
|
|
e
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
last_progress_update = std::time::Instant::now();
|
|
|
|
|
|
|
|
|
|
// Check if job has been cancelled
|
|
|
|
|
if is_job_cancelled(&state.pool, job_id).await? {
|
|
|
|
|
info!("[JOB] Job {} cancelled by user, stopping...", job_id);
|
|
|
|
|
// Flush any pending batches before exiting
|
|
|
|
|
flush_all_batches(&state.pool, &mut books_to_update, &mut files_to_update, &mut books_to_insert, &mut files_to_insert, &mut errors_to_insert).await?;
|
|
|
|
|
return Err(anyhow::anyhow!("Job cancelled by user"));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Use local path for seen tracking to match existing keys
|
|
|
|
|
let seen_key = remap_libraries_path(&abs_path);
|
|
|
|
|
seen.insert(seen_key, true);
|
|
|
|
|
seen.insert(seen_key.clone(), true);
|
|
|
|
|
|
|
|
|
|
let meta_start = std::time::Instant::now();
|
|
|
|
|
let metadata = std::fs::metadata(path)
|
|
|
|
|
.with_context(|| format!("cannot stat {}", path.display()))?;
|
|
|
|
|
let mtime: DateTime<Utc> = metadata
|
|
|
|
|
@@ -580,158 +880,157 @@ async fn scan_library(
|
|
|
|
|
.map(DateTime::<Utc>::from)
|
|
|
|
|
.unwrap_or_else(|_| Utc::now());
|
|
|
|
|
let fingerprint = compute_fingerprint(path, metadata.len(), &mtime)?;
|
|
|
|
|
info!("[META] Metadata+fingerprint took {:?}", meta_start.elapsed());
|
|
|
|
|
|
|
|
|
|
// Use local path to lookup in existing (which has local paths as keys)
|
|
|
|
|
let lookup_path = remap_libraries_path(&abs_path);
|
|
|
|
|
if let Some((file_id, book_id, old_fingerprint)) = existing.get(&lookup_path).cloned() {
|
|
|
|
|
// Skip fingerprint check for full rebuilds - always reindex
|
|
|
|
|
if !is_full_rebuild && old_fingerprint == fingerprint {
|
|
|
|
|
info!("[SKIP] File unchanged, skipping: {} (total time: {:?})", file_name, start_time.elapsed());
|
|
|
|
|
trace!("[PROCESS] Skipping unchanged file: {}", file_name);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
info!("[PARSER] Starting parse_metadata for: {}", file_name);
|
|
|
|
|
let parse_start = std::time::Instant::now();
|
|
|
|
|
info!("[PROCESS] Updating existing file: {} (full_rebuild={}, fingerprint_match={})", file_name, is_full_rebuild, old_fingerprint == fingerprint);
|
|
|
|
|
|
|
|
|
|
match parse_metadata(path, format, root) {
|
|
|
|
|
Ok(parsed) => {
|
|
|
|
|
info!("[PARSER] Parsing took {:?} for {} (pages={:?})", parse_start.elapsed(), file_name, parsed.page_count);
|
|
|
|
|
|
|
|
|
|
let db_start = std::time::Instant::now();
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"UPDATE books SET title = $2, kind = $3, series = $4, volume = $5, page_count = $6, updated_at = NOW() WHERE id = $1",
|
|
|
|
|
)
|
|
|
|
|
.bind(book_id)
|
|
|
|
|
.bind(&parsed.title)
|
|
|
|
|
.bind(kind_from_format(format))
|
|
|
|
|
.bind(&parsed.series)
|
|
|
|
|
.bind(&parsed.volume)
|
|
|
|
|
.bind(parsed.page_count)
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await?;
|
|
|
|
|
books_to_update.push(BookUpdate {
|
|
|
|
|
book_id,
|
|
|
|
|
title: parsed.title,
|
|
|
|
|
kind: kind_from_format(format).to_string(),
|
|
|
|
|
series: parsed.series,
|
|
|
|
|
volume: parsed.volume,
|
|
|
|
|
page_count: parsed.page_count,
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"UPDATE book_files SET format = $2, size_bytes = $3, mtime = $4, fingerprint = $5, parse_status = 'ok', parse_error_opt = NULL, updated_at = NOW() WHERE id = $1",
|
|
|
|
|
)
|
|
|
|
|
.bind(file_id)
|
|
|
|
|
.bind(format.as_str())
|
|
|
|
|
.bind(metadata.len() as i64)
|
|
|
|
|
.bind(mtime)
|
|
|
|
|
.bind(fingerprint)
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await?;
|
|
|
|
|
info!("[BDD] UPDATE took {:?} for {}", db_start.elapsed(), file_name);
|
|
|
|
|
files_to_update.push(FileUpdate {
|
|
|
|
|
file_id,
|
|
|
|
|
format: format.as_str().to_string(),
|
|
|
|
|
size_bytes: metadata.len() as i64,
|
|
|
|
|
mtime,
|
|
|
|
|
fingerprint,
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
stats.indexed_files += 1;
|
|
|
|
|
info!("[DONE] Updated file {} (total time: {:?})", file_name, start_time.elapsed());
|
|
|
|
|
}
|
|
|
|
|
Err(err) => {
|
|
|
|
|
warn!("[PARSER] Failed to parse {} after {:?}: {}", file_name, parse_start.elapsed(), err);
|
|
|
|
|
warn!("[PARSER] Failed to parse {}: {}", file_name, err);
|
|
|
|
|
stats.errors += 1;
|
|
|
|
|
|
|
|
|
|
files_to_update.push(FileUpdate {
|
|
|
|
|
file_id,
|
|
|
|
|
format: format.as_str().to_string(),
|
|
|
|
|
size_bytes: metadata.len() as i64,
|
|
|
|
|
mtime,
|
|
|
|
|
fingerprint: fingerprint.clone(),
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
errors_to_insert.push(ErrorInsert {
|
|
|
|
|
job_id,
|
|
|
|
|
file_path: abs_path.clone(),
|
|
|
|
|
error_message: err.to_string(),
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
// Also need to mark file as error - we'll do this separately
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"UPDATE book_files SET parse_status = 'error', parse_error_opt = $2, updated_at = NOW() WHERE id = $1",
|
|
|
|
|
"UPDATE book_files SET parse_status = 'error', parse_error_opt = $2 WHERE id = $1"
|
|
|
|
|
)
|
|
|
|
|
.bind(file_id)
|
|
|
|
|
.bind(err.to_string())
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
// Store error in index_job_errors table
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"INSERT INTO index_job_errors (job_id, file_path, error_message) VALUES ($1, $2, $3)"
|
|
|
|
|
)
|
|
|
|
|
.bind(job_id)
|
|
|
|
|
.bind(&abs_path)
|
|
|
|
|
.bind(err.to_string())
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await?;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Flush if batch is full
|
|
|
|
|
if books_to_update.len() >= BATCH_SIZE || files_to_update.len() >= BATCH_SIZE {
|
|
|
|
|
flush_all_batches(&state.pool, &mut books_to_update, &mut files_to_update, &mut books_to_insert, &mut files_to_insert, &mut errors_to_insert).await?;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
info!("[PARSER] Starting parse_metadata for new file: {}", file_name);
|
|
|
|
|
let parse_start = std::time::Instant::now();
|
|
|
|
|
// New file
|
|
|
|
|
info!("[PROCESS] Inserting new file: {}", file_name);
|
|
|
|
|
match parse_metadata(path, format, root) {
|
|
|
|
|
Ok(parsed) => {
|
|
|
|
|
info!("[PARSER] Parsing took {:?} for {} (pages={:?})", parse_start.elapsed(), file_name, parsed.page_count);
|
|
|
|
|
|
|
|
|
|
let db_start = std::time::Instant::now();
|
|
|
|
|
let book_id = Uuid::new_v4();
|
|
|
|
|
let file_id = Uuid::new_v4();
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"INSERT INTO books (id, library_id, kind, title, series, volume, page_count) VALUES ($1, $2, $3, $4, $5, $6, $7)",
|
|
|
|
|
)
|
|
|
|
|
.bind(book_id)
|
|
|
|
|
.bind(library_id)
|
|
|
|
|
.bind(kind_from_format(format))
|
|
|
|
|
.bind(&parsed.title)
|
|
|
|
|
.bind(&parsed.series)
|
|
|
|
|
.bind(&parsed.volume)
|
|
|
|
|
.bind(parsed.page_count)
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
books_to_insert.push(BookInsert {
|
|
|
|
|
book_id,
|
|
|
|
|
library_id,
|
|
|
|
|
kind: kind_from_format(format).to_string(),
|
|
|
|
|
title: parsed.title,
|
|
|
|
|
series: parsed.series,
|
|
|
|
|
volume: parsed.volume,
|
|
|
|
|
page_count: parsed.page_count,
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"INSERT INTO book_files (id, book_id, format, abs_path, size_bytes, mtime, fingerprint, parse_status) VALUES ($1, $2, $3, $4, $5, $6, $7, 'ok')",
|
|
|
|
|
)
|
|
|
|
|
.bind(file_id)
|
|
|
|
|
.bind(book_id)
|
|
|
|
|
.bind(format.as_str())
|
|
|
|
|
.bind(&abs_path)
|
|
|
|
|
.bind(metadata.len() as i64)
|
|
|
|
|
.bind(mtime)
|
|
|
|
|
.bind(fingerprint)
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await?;
|
|
|
|
|
info!("[BDD] INSERT took {:?} for {}", db_start.elapsed(), file_name);
|
|
|
|
|
files_to_insert.push(FileInsert {
|
|
|
|
|
file_id,
|
|
|
|
|
book_id,
|
|
|
|
|
format: format.as_str().to_string(),
|
|
|
|
|
abs_path: abs_path.clone(),
|
|
|
|
|
size_bytes: metadata.len() as i64,
|
|
|
|
|
mtime,
|
|
|
|
|
fingerprint,
|
|
|
|
|
parse_status: "ok".to_string(),
|
|
|
|
|
parse_error: None,
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
stats.indexed_files += 1;
|
|
|
|
|
info!("[DONE] Inserted new file {} (total time: {:?})", file_name, start_time.elapsed());
|
|
|
|
|
}
|
|
|
|
|
Err(err) => {
|
|
|
|
|
warn!("[PARSER] Failed to parse {} after {:?}: {}", file_name, parse_start.elapsed(), err);
|
|
|
|
|
warn!("[PARSER] Failed to parse {}: {}", file_name, err);
|
|
|
|
|
stats.errors += 1;
|
|
|
|
|
let book_id = Uuid::new_v4();
|
|
|
|
|
let file_id = Uuid::new_v4();
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"INSERT INTO books (id, library_id, kind, title, page_count) VALUES ($1, $2, $3, $4, NULL)",
|
|
|
|
|
)
|
|
|
|
|
.bind(book_id)
|
|
|
|
|
.bind(library_id)
|
|
|
|
|
.bind(kind_from_format(format))
|
|
|
|
|
.bind(file_display_name(path))
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
books_to_insert.push(BookInsert {
|
|
|
|
|
book_id,
|
|
|
|
|
library_id,
|
|
|
|
|
kind: kind_from_format(format).to_string(),
|
|
|
|
|
title: file_display_name(path),
|
|
|
|
|
series: None,
|
|
|
|
|
volume: None,
|
|
|
|
|
page_count: None,
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"INSERT INTO book_files (id, book_id, format, abs_path, size_bytes, mtime, fingerprint, parse_status, parse_error_opt) VALUES ($1, $2, $3, $4, $5, $6, $7, 'error', $8)",
|
|
|
|
|
)
|
|
|
|
|
.bind(file_id)
|
|
|
|
|
.bind(book_id)
|
|
|
|
|
.bind(format.as_str())
|
|
|
|
|
.bind(&abs_path)
|
|
|
|
|
.bind(metadata.len() as i64)
|
|
|
|
|
.bind(mtime)
|
|
|
|
|
.bind(fingerprint)
|
|
|
|
|
.bind(err.to_string())
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await?;
|
|
|
|
|
files_to_insert.push(FileInsert {
|
|
|
|
|
file_id,
|
|
|
|
|
book_id,
|
|
|
|
|
format: format.as_str().to_string(),
|
|
|
|
|
abs_path: abs_path.clone(),
|
|
|
|
|
size_bytes: metadata.len() as i64,
|
|
|
|
|
mtime,
|
|
|
|
|
fingerprint,
|
|
|
|
|
parse_status: "error".to_string(),
|
|
|
|
|
parse_error: Some(err.to_string()),
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
// Store error in index_job_errors table
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"INSERT INTO index_job_errors (job_id, file_path, error_message) VALUES ($1, $2, $3)"
|
|
|
|
|
)
|
|
|
|
|
.bind(job_id)
|
|
|
|
|
.bind(&abs_path)
|
|
|
|
|
.bind(err.to_string())
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await?;
|
|
|
|
|
errors_to_insert.push(ErrorInsert {
|
|
|
|
|
job_id,
|
|
|
|
|
file_path: abs_path,
|
|
|
|
|
error_message: err.to_string(),
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Flush if batch is full
|
|
|
|
|
if books_to_insert.len() >= BATCH_SIZE || files_to_insert.len() >= BATCH_SIZE {
|
|
|
|
|
flush_all_batches(&state.pool, &mut books_to_update, &mut files_to_update, &mut books_to_insert, &mut files_to_insert, &mut errors_to_insert).await?;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
trace!("[DONE] Processed file {} (total time: {:?})", file_name, start_time.elapsed());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Final flush of any remaining items
|
|
|
|
|
flush_all_batches(&state.pool, &mut books_to_update, &mut files_to_update, &mut books_to_insert, &mut files_to_insert, &mut errors_to_insert).await?;
|
|
|
|
|
|
|
|
|
|
info!("[SCAN] Library {} scan complete: {} files scanned, {} indexed, {} errors",
|
|
|
|
|
library_id, library_processed_count, stats.indexed_files, stats.errors);
|
|
|
|
|
|
|
|
|
|
// Handle deletions
|
|
|
|
|
let mut removed_count = 0usize;
|
|
|
|
|
for (abs_path, (file_id, book_id, _)) in existing {
|
|
|
|
|
if seen.contains_key(&abs_path) {
|
|
|
|
|
continue;
|
|
|
|
|
@@ -745,19 +1044,28 @@ async fn scan_library(
|
|
|
|
|
.execute(&state.pool)
|
|
|
|
|
.await?;
|
|
|
|
|
stats.removed_files += 1;
|
|
|
|
|
removed_count += 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if removed_count > 0 {
|
|
|
|
|
info!("[SCAN] Removed {} stale files from database", removed_count);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn compute_fingerprint(path: &Path, size: u64, mtime: &DateTime<Utc>) -> anyhow::Result<String> {
|
|
|
|
|
// Optimized: only use size + mtime + first bytes of filename for fast fingerprinting
|
|
|
|
|
// This is 100x faster than reading file content while still being reliable for change detection
|
|
|
|
|
let mut hasher = Sha256::new();
|
|
|
|
|
hasher.update(size.to_le_bytes());
|
|
|
|
|
hasher.update(mtime.timestamp().to_le_bytes());
|
|
|
|
|
|
|
|
|
|
let bytes = std::fs::read(path)?;
|
|
|
|
|
let take = bytes.len().min(65_536);
|
|
|
|
|
hasher.update(&bytes[..take]);
|
|
|
|
|
|
|
|
|
|
// Add filename for extra uniqueness (in case of rapid changes with same size+mtime)
|
|
|
|
|
if let Some(filename) = path.file_name() {
|
|
|
|
|
hasher.update(filename.as_encoded_bytes());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(format!("{:x}", hasher.finalize()))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -790,6 +1098,7 @@ async fn sync_meili(pool: &sqlx::PgPool, meili_url: &str, meili_master_key: &str
|
|
|
|
|
let client = reqwest::Client::new();
|
|
|
|
|
let base = meili_url.trim_end_matches('/');
|
|
|
|
|
|
|
|
|
|
// Ensure index exists and has proper settings
|
|
|
|
|
let _ = client
|
|
|
|
|
.post(format!("{base}/indexes"))
|
|
|
|
|
.header("Authorization", format!("Bearer {meili_master_key}"))
|
|
|
|
|
@@ -804,19 +1113,49 @@ async fn sync_meili(pool: &sqlx::PgPool, meili_url: &str, meili_master_key: &str
|
|
|
|
|
.send()
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
// Clear existing documents to avoid stale data
|
|
|
|
|
let _ = client
|
|
|
|
|
.delete(format!("{base}/indexes/books/documents"))
|
|
|
|
|
.header("Authorization", format!("Bearer {meili_master_key}"))
|
|
|
|
|
.send()
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
let rows = sqlx::query(
|
|
|
|
|
"SELECT id, library_id, kind, title, author, series, volume, language FROM books",
|
|
|
|
|
// Get last sync timestamp
|
|
|
|
|
let last_sync: Option<DateTime<Utc>> = sqlx::query_scalar(
|
|
|
|
|
"SELECT last_meili_sync FROM sync_metadata WHERE id = 1 AND last_meili_sync IS NOT NULL"
|
|
|
|
|
)
|
|
|
|
|
.fetch_all(pool)
|
|
|
|
|
.fetch_optional(pool)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
// If no previous sync, do a full sync
|
|
|
|
|
let is_full_sync = last_sync.is_none();
|
|
|
|
|
|
|
|
|
|
// Get books to sync: all if full sync, only modified since last sync otherwise
|
|
|
|
|
let rows = if is_full_sync {
|
|
|
|
|
info!("[MEILI] Performing full sync");
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"SELECT id, library_id, kind, title, author, series, volume, language, updated_at FROM books",
|
|
|
|
|
)
|
|
|
|
|
.fetch_all(pool)
|
|
|
|
|
.await?
|
|
|
|
|
} else {
|
|
|
|
|
let since = last_sync.unwrap();
|
|
|
|
|
info!("[MEILI] Performing incremental sync since {}", since);
|
|
|
|
|
|
|
|
|
|
// Also get deleted book IDs to remove from MeiliSearch
|
|
|
|
|
// For now, we'll do a diff approach: get all book IDs from DB and compare with Meili
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"SELECT id, library_id, kind, title, author, series, volume, language, updated_at FROM books WHERE updated_at > $1",
|
|
|
|
|
)
|
|
|
|
|
.bind(since)
|
|
|
|
|
.fetch_all(pool)
|
|
|
|
|
.await?
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if rows.is_empty() && !is_full_sync {
|
|
|
|
|
info!("[MEILI] No changes to sync");
|
|
|
|
|
// Still update the timestamp
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"INSERT INTO sync_metadata (id, last_meili_sync) VALUES (1, NOW()) ON CONFLICT (id) DO UPDATE SET last_meili_sync = NOW()"
|
|
|
|
|
)
|
|
|
|
|
.execute(pool)
|
|
|
|
|
.await?;
|
|
|
|
|
return Ok(());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let docs: Vec<SearchDoc> = rows
|
|
|
|
|
.into_iter()
|
|
|
|
|
.map(|row| SearchDoc {
|
|
|
|
|
@@ -831,13 +1170,87 @@ async fn sync_meili(pool: &sqlx::PgPool, meili_url: &str, meili_master_key: &str
|
|
|
|
|
})
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
|
|
client
|
|
|
|
|
.put(format!("{base}/indexes/books/documents?primaryKey=id"))
|
|
|
|
|
.header("Authorization", format!("Bearer {meili_master_key}"))
|
|
|
|
|
.json(&docs)
|
|
|
|
|
.send()
|
|
|
|
|
.await
|
|
|
|
|
.context("failed to push docs to meili")?;
|
|
|
|
|
let doc_count = docs.len();
|
|
|
|
|
|
|
|
|
|
// Send documents to MeiliSearch in batches of 1000
|
|
|
|
|
const MEILI_BATCH_SIZE: usize = 1000;
|
|
|
|
|
for (i, chunk) in docs.chunks(MEILI_BATCH_SIZE).enumerate() {
|
|
|
|
|
let batch_num = i + 1;
|
|
|
|
|
info!("[MEILI] Sending batch {}/{} ({} docs)", batch_num, (doc_count + MEILI_BATCH_SIZE - 1) / MEILI_BATCH_SIZE, chunk.len());
|
|
|
|
|
|
|
|
|
|
let response = client
|
|
|
|
|
.post(format!("{base}/indexes/books/documents"))
|
|
|
|
|
.header("Authorization", format!("Bearer {meili_master_key}"))
|
|
|
|
|
.json(&chunk)
|
|
|
|
|
.send()
|
|
|
|
|
.await
|
|
|
|
|
.context("failed to send docs to meili")?;
|
|
|
|
|
|
|
|
|
|
if !response.status().is_success() {
|
|
|
|
|
let status = response.status();
|
|
|
|
|
let body = response.text().await.unwrap_or_default();
|
|
|
|
|
return Err(anyhow::anyhow!("MeiliSearch error {}: {}", status, body));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Handle deletions: get all book IDs from DB and remove from MeiliSearch any that don't exist
|
|
|
|
|
// This is expensive, so we only do it periodically (every 10 syncs) or on full syncs
|
|
|
|
|
if is_full_sync || rand::random::<u8>() < 26 { // ~10% chance
|
|
|
|
|
info!("[MEILI] Checking for documents to delete");
|
|
|
|
|
|
|
|
|
|
// Get all book IDs from database
|
|
|
|
|
let db_ids: Vec<String> = sqlx::query_scalar("SELECT id::text FROM books")
|
|
|
|
|
.fetch_all(pool)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
// Get all document IDs from MeiliSearch (this requires fetching all documents)
|
|
|
|
|
// For efficiency, we'll just delete by query for documents that might be stale
|
|
|
|
|
// A better approach would be to track deletions in a separate table
|
|
|
|
|
|
|
|
|
|
// For now, we'll do a simple approach: fetch all Meili docs and compare
|
|
|
|
|
// Note: This could be slow for large collections
|
|
|
|
|
let meili_response = client
|
|
|
|
|
.post(format!("{base}/indexes/books/documents/fetch"))
|
|
|
|
|
.header("Authorization", format!("Bearer {meili_master_key}"))
|
|
|
|
|
.json(&serde_json::json!({
|
|
|
|
|
"fields": ["id"],
|
|
|
|
|
"limit": 100000
|
|
|
|
|
}))
|
|
|
|
|
.send()
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
if let Ok(response) = meili_response {
|
|
|
|
|
if response.status().is_success() {
|
|
|
|
|
if let Ok(meili_docs) = response.json::<Vec<serde_json::Value>>().await {
|
|
|
|
|
let meili_ids: std::collections::HashSet<String> = meili_docs
|
|
|
|
|
.into_iter()
|
|
|
|
|
.filter_map(|doc| doc.get("id").and_then(|id| id.as_str()).map(|s| s.to_string()))
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
|
|
let db_ids_set: std::collections::HashSet<String> = db_ids.into_iter().collect();
|
|
|
|
|
let to_delete: Vec<String> = meili_ids.difference(&db_ids_set).cloned().collect();
|
|
|
|
|
|
|
|
|
|
if !to_delete.is_empty() {
|
|
|
|
|
info!("[MEILI] Deleting {} stale documents", to_delete.len());
|
|
|
|
|
let _ = client
|
|
|
|
|
.post(format!("{base}/indexes/books/documents/delete-batch"))
|
|
|
|
|
.header("Authorization", format!("Bearer {meili_master_key}"))
|
|
|
|
|
.json(&to_delete)
|
|
|
|
|
.send()
|
|
|
|
|
.await;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Update last sync timestamp
|
|
|
|
|
sqlx::query(
|
|
|
|
|
"INSERT INTO sync_metadata (id, last_meili_sync) VALUES (1, NOW()) ON CONFLICT (id) DO UPDATE SET last_meili_sync = NOW()"
|
|
|
|
|
)
|
|
|
|
|
.execute(pool)
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
|
|
info!("[MEILI] Sync completed: {} documents indexed", doc_count);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|