add indexing jobs, parsers, and search APIs

This commit is contained in:
2026-03-05 15:05:34 +01:00
parent 88db9805b5
commit 6eaf2ba5dc
17 changed files with 1548 additions and 46 deletions

View File

@@ -12,6 +12,7 @@ base64.workspace = true
chrono.workspace = true
stripstream-core = { path = "../../crates/core" }
rand.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
sqlx.workspace = true

View File

@@ -32,6 +32,18 @@ pub async fn require_admin(
Ok(next.run(req).await)
}
/// Middleware guarding read endpoints: the request must carry a bearer token
/// that `authenticate` accepts.
///
/// The value returned by `authenticate` is stored in the request extensions
/// before the remaining middleware/handler stack runs.
///
/// # Errors
/// Fails with `ApiError::unauthorized("missing bearer token")` when
/// `bearer_token` finds no token, and propagates any error from `authenticate`.
pub async fn require_read(
    State(state): State<AppState>,
    mut req: Request,
    next: Next,
) -> Result<Response, ApiError> {
    let Some(token) = bearer_token(&req) else {
        return Err(ApiError::unauthorized("missing bearer token"));
    };
    let scope = authenticate(&state, token).await?;
    req.extensions_mut().insert(scope);

    let response = next.run(req).await;
    Ok(response)
}
fn bearer_token(req: &Request) -> Option<&str> {
req.headers()
.get(AUTHORIZATION)

143
apps/api/src/books.rs Normal file
View File

@@ -0,0 +1,143 @@
use axum::{extract::{Path, Query, State}, Json};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use sqlx::Row;
use uuid::Uuid;
use crate::{error::ApiError, AppState};
/// Query-string parameters accepted by `list_books`. Every field is optional.
#[derive(Deserialize)]
pub struct ListBooksQuery {
/// When set, only books whose `library_id` equals this value are returned.
pub library_id: Option<Uuid>,
/// When set, only books whose `kind` column equals this value are returned.
pub kind: Option<String>,
/// Pagination cursor: when set, only books with an id greater than this are returned.
pub cursor: Option<Uuid>,
/// Requested page size; `list_books` defaults it to 50 and clamps it to 1..=200.
pub limit: Option<i64>,
}
/// One entry of a `BooksPage`, as serialized by `list_books`.
///
/// Each field is read from the same-named column of the `books` table.
#[derive(Serialize)]
pub struct BookItem {
pub id: Uuid,
pub library_id: Uuid,
pub kind: String,
pub title: String,
pub author: Option<String>,
pub series: Option<String>,
pub volume: Option<String>,
pub language: Option<String>,
pub page_count: Option<i32>,
pub updated_at: DateTime<Utc>,
}
/// Response body of `list_books`: one page of books plus the cursor for the next page.
#[derive(Serialize)]
pub struct BooksPage {
/// Books on this page, ordered by ascending id.
pub items: Vec<BookItem>,
/// Id of the last item on this page when more rows exist; `None` on the final page.
pub next_cursor: Option<Uuid>,
}
/// Response body of `get_book`: the `books` row plus data from one of its `book_files` rows.
#[derive(Serialize)]
pub struct BookDetails {
pub id: Uuid,
pub library_id: Uuid,
pub kind: String,
pub title: String,
pub author: Option<String>,
pub series: Option<String>,
pub volume: Option<String>,
pub language: Option<String>,
pub page_count: Option<i32>,
/// `abs_path` of the most recently updated `book_files` row for this book, if any.
pub file_path: Option<String>,
/// `format` of that same `book_files` row, if any.
pub file_format: Option<String>,
/// `parse_status` of that same `book_files` row, if any.
pub file_parse_status: Option<String>,
}
/// Lists books in ascending id order, one page at a time.
///
/// Optional filters (`library_id`, `kind`) and the `cursor` are applied in SQL;
/// a `NULL` bind disables the corresponding condition. The page size defaults
/// to 50 and is clamped to 1..=200.
///
/// One row more than the page size is fetched: if that extra row exists, the id
/// of the last returned item is handed back as `next_cursor`.
///
/// # Errors
/// Propagates database errors through `ApiError`.
pub async fn list_books(
    State(state): State<AppState>,
    Query(query): Query<ListBooksQuery>,
) -> Result<Json<BooksPage>, ApiError> {
    let page_size = query.limit.unwrap_or(50).clamp(1, 200);

    let fetched = sqlx::query(
        r#"
SELECT id, library_id, kind, title, author, series, volume, language, page_count, updated_at
FROM books
WHERE ($1::uuid IS NULL OR library_id = $1)
AND ($2::text IS NULL OR kind = $2)
AND ($3::uuid IS NULL OR id > $3)
ORDER BY id ASC
LIMIT $4
"#,
    )
    .bind(query.library_id)
    .bind(query.kind.as_deref())
    .bind(query.cursor)
    .bind(page_size + 1)
    .fetch_all(&state.pool)
    .await?;

    let capacity = page_size as usize;
    // The probe row beyond the page size only signals that another page exists.
    let has_more = fetched.len() > capacity;

    let mut items: Vec<BookItem> = Vec::with_capacity(fetched.len().min(capacity));
    for row in fetched.iter().take(capacity) {
        items.push(BookItem {
            id: row.get("id"),
            library_id: row.get("library_id"),
            kind: row.get("kind"),
            title: row.get("title"),
            author: row.get("author"),
            series: row.get("series"),
            volume: row.get("volume"),
            language: row.get("language"),
            page_count: row.get("page_count"),
            updated_at: row.get("updated_at"),
        });
    }

    let next_cursor = match items.last() {
        Some(last) if has_more => Some(last.id),
        _ => None,
    };

    Ok(Json(BooksPage { items, next_cursor }))
}
/// Returns a single book together with data from its most recently updated file.
///
/// The lateral join picks at most one `book_files` row (latest `updated_at`);
/// because it is a `LEFT JOIN`, a book without files is still returned and the
/// three `file_*` fields are `None`.
///
/// # Errors
/// `ApiError::not_found("book not found")` when no book has the given id;
/// database errors are propagated through `ApiError`.
pub async fn get_book(
    State(state): State<AppState>,
    Path(id): Path<Uuid>,
) -> Result<Json<BookDetails>, ApiError> {
    let found = sqlx::query(
        r#"
SELECT b.id, b.library_id, b.kind, b.title, b.author, b.series, b.volume, b.language, b.page_count,
bf.abs_path, bf.format, bf.parse_status
FROM books b
LEFT JOIN LATERAL (
SELECT abs_path, format, parse_status
FROM book_files
WHERE book_id = b.id
ORDER BY updated_at DESC
LIMIT 1
) bf ON TRUE
WHERE b.id = $1
"#,
    )
    .bind(id)
    .fetch_optional(&state.pool)
    .await?;

    let Some(row) = found else {
        return Err(ApiError::not_found("book not found"));
    };

    let details = BookDetails {
        id: row.get("id"),
        library_id: row.get("library_id"),
        kind: row.get("kind"),
        title: row.get("title"),
        author: row.get("author"),
        series: row.get("series"),
        volume: row.get("volume"),
        language: row.get("language"),
        page_count: row.get("page_count"),
        file_path: row.get("abs_path"),
        file_format: row.get("format"),
        file_parse_status: row.get("parse_status"),
    };
    Ok(Json(details))
}

View File

@@ -0,0 +1,74 @@
use axum::{extract::State, Json};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use sqlx::Row;
use uuid::Uuid;
use crate::{error::ApiError, AppState};
/// Optional JSON body of `enqueue_rebuild`.
#[derive(Deserialize)]
pub struct RebuildRequest {
/// Library the job is stored against; `None` is stored as a NULL `library_id`.
pub library_id: Option<Uuid>,
}
/// Serialized view of one `index_jobs` row; each field is read from the same-named column.
#[derive(Serialize)]
pub struct IndexJobItem {
pub id: Uuid,
/// Library the job is restricted to, or `None` when the job row has no library.
pub library_id: Option<Uuid>,
/// Job type; `enqueue_rebuild` inserts `'rebuild'`.
pub r#type: String,
/// Job status; `enqueue_rebuild` inserts `'pending'`.
pub status: String,
pub started_at: Option<DateTime<Utc>>,
pub finished_at: Option<DateTime<Utc>>,
pub stats_json: Option<serde_json::Value>,
pub error_opt: Option<String>,
pub created_at: DateTime<Utc>,
}
pub async fn enqueue_rebuild(
State(state): State<AppState>,
payload: Option<Json<RebuildRequest>>,
) -> Result<Json<IndexJobItem>, ApiError> {
let library_id = payload.and_then(|p| p.0.library_id);
let id = Uuid::new_v4();
sqlx::query(
"INSERT INTO index_jobs (id, library_id, type, status) VALUES ($1, $2, 'rebuild', 'pending')",
)
.bind(id)
.bind(library_id)
.execute(&state.pool)
.await?;
let row = sqlx::query(
"SELECT id, library_id, type, status, started_at, finished_at, stats_json, error_opt, created_at FROM index_jobs WHERE id = $1",
)
.bind(id)
.fetch_one(&state.pool)
.await?;
Ok(Json(map_row(row)))
}
pub async fn list_index_jobs(State(state): State<AppState>) -> Result<Json<Vec<IndexJobItem>>, ApiError> {
let rows = sqlx::query(
"SELECT id, library_id, type, status, started_at, finished_at, stats_json, error_opt, created_at FROM index_jobs ORDER BY created_at DESC LIMIT 100",
)
.fetch_all(&state.pool)
.await?;
Ok(Json(rows.into_iter().map(map_row).collect()))
}
fn map_row(row: sqlx::postgres::PgRow) -> IndexJobItem {
IndexJobItem {
id: row.get("id"),
library_id: row.get("library_id"),
r#type: row.get("type"),
status: row.get("status"),
started_at: row.get("started_at"),
finished_at: row.get("finished_at"),
stats_json: row.get("stats_json"),
error_opt: row.get("error_opt"),
created_at: row.get("created_at"),
}
}

View File

@@ -1,6 +1,9 @@
mod auth;
mod books;
mod error;
mod index_jobs;
mod libraries;
mod search;
mod tokens;
use std::sync::Arc;
@@ -14,6 +17,8 @@ use tracing::info;
/// Shared state handed to the API router and its middleware.
struct AppState {
/// PostgreSQL connection pool used by the handlers.
pool: sqlx::PgPool,
/// Built from `config.api_bootstrap_token` in `main`.
bootstrap_token: Arc<str>,
/// Base URL of the Meilisearch instance; `search_books` appends `/indexes/books/search`.
meili_url: Arc<str>,
/// Key sent as the `Authorization: Bearer` credential on Meilisearch requests.
meili_master_key: Arc<str>,
}
#[tokio::main]
@@ -33,18 +38,29 @@ async fn main() -> anyhow::Result<()> {
let state = AppState {
pool,
bootstrap_token: Arc::from(config.api_bootstrap_token),
meili_url: Arc::from(config.meili_url),
meili_master_key: Arc::from(config.meili_master_key),
};
let protected = Router::new()
let admin_routes = Router::new()
.route("/libraries", get(libraries::list_libraries).post(libraries::create_library))
.route("/libraries/:id", delete(libraries::delete_library))
.route("/index/rebuild", axum::routing::post(index_jobs::enqueue_rebuild))
.route("/index/status", get(index_jobs::list_index_jobs))
.route("/admin/tokens", get(tokens::list_tokens).post(tokens::create_token))
.route("/admin/tokens/:id", delete(tokens::revoke_token))
.layer(middleware::from_fn_with_state(state.clone(), auth::require_admin));
let read_routes = Router::new()
.route("/books", get(books::list_books))
.route("/books/:id", get(books::get_book))
.route("/search", get(search::search_books))
.layer(middleware::from_fn_with_state(state.clone(), auth::require_read));
let app = Router::new()
.route("/health", get(health))
.merge(protected)
.merge(admin_routes)
.merge(read_routes)
.with_state(state);
let listener = tokio::net::TcpListener::bind(&config.listen_addr).await?;

77
apps/api/src/search.rs Normal file
View File

@@ -0,0 +1,77 @@
use axum::{extract::{Query, State}, Json};
use serde::{Deserialize, Serialize};
use crate::{error::ApiError, AppState};
/// Query-string parameters accepted by `search_books`.
#[derive(Deserialize)]
pub struct SearchQuery {
/// Search text; `search_books` rejects a value that is empty after trimming.
pub q: String,
/// When set, restricts hits to documents whose `library_id` equals this value.
pub library_id: Option<String>,
/// When set, restricts hits by `kind`; takes precedence over the `kind` field.
pub r#type: Option<String>,
/// When set (and `type` is absent), restricts hits by `kind`.
pub kind: Option<String>,
/// Maximum number of hits; `search_books` defaults it to 20 and clamps it to 1..=100.
pub limit: Option<usize>,
}
/// Response body of `search_books`, extracted from the Meilisearch search response.
#[derive(Serialize)]
pub struct SearchResponse {
/// The `hits` value of the Meilisearch response, passed through as-is (`[]` when absent).
pub hits: serde_json::Value,
/// The `estimatedTotalHits` value of the Meilisearch response, when present and a `u64`.
pub estimated_total_hits: Option<u64>,
/// The `processingTimeMs` value of the Meilisearch response, when present and a `u64`.
pub processing_time_ms: Option<u64>,
}
/// Runs a full-text search against the Meilisearch `books` index.
///
/// `q` is mandatory. `library_id` and `type`/`kind` (`type` wins when both are
/// given) become equality filters joined with `AND`. `limit` defaults to 20 and
/// is clamped to 1..=100.
///
/// When Meilisearch answers with an error whose body mentions
/// `index_not_found`, an empty result is returned instead of an error.
///
/// # Errors
/// * `ApiError::bad_request("q is required")` when `q` is blank.
/// * `ApiError::internal(..)` when the request cannot be sent, Meilisearch
///   answers with any other non-success status, or its body is not valid JSON.
pub async fn search_books(
    State(state): State<AppState>,
    Query(query): Query<SearchQuery>,
) -> Result<Json<SearchResponse>, ApiError> {
    if query.q.trim().is_empty() {
        return Err(ApiError::bad_request("q is required"));
    }

    // Filter values come straight from the query string, so they are quoted and
    // escaped before being spliced into the filter expression.
    let mut filters: Vec<String> = Vec::new();
    if let Some(library_id) = query.library_id.as_deref() {
        filters.push(format!("library_id = {}", quote_filter_value(library_id)));
    }
    let kind_filter = query.r#type.as_deref().or(query.kind.as_deref());
    if let Some(kind) = kind_filter {
        filters.push(format!("kind = {}", quote_filter_value(kind)));
    }

    let body = serde_json::json!({
        "q": query.q,
        "limit": query.limit.unwrap_or(20).clamp(1, 100),
        "filter": if filters.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(filters.join(" AND ")) }
    });

    // NOTE(review): a new HTTP client is built for every request; sharing one
    // would require a field on `AppState`, which is outside this function.
    let client = reqwest::Client::new();
    let url = format!("{}/indexes/books/search", state.meili_url.trim_end_matches('/'));
    let response = client
        .post(url)
        .header("Authorization", format!("Bearer {}", state.meili_master_key))
        .json(&body)
        .send()
        .await
        .map_err(|e| ApiError::internal(format!("meili request failed: {e}")))?;

    if !response.status().is_success() {
        let body = response.text().await.unwrap_or_else(|_| "unknown meili error".to_string());
        // A missing index is reported as "no results" rather than as a failure.
        if body.contains("index_not_found") {
            return Ok(Json(SearchResponse {
                hits: serde_json::json!([]),
                estimated_total_hits: Some(0),
                processing_time_ms: Some(0),
            }));
        }
        return Err(ApiError::internal(format!("meili error: {body}")));
    }

    let payload: serde_json::Value = response
        .json()
        .await
        .map_err(|e| ApiError::internal(format!("invalid meili response: {e}")))?;

    Ok(Json(SearchResponse {
        hits: payload.get("hits").cloned().unwrap_or_else(|| serde_json::json!([])),
        estimated_total_hits: payload.get("estimatedTotalHits").and_then(|v| v.as_u64()),
        processing_time_ms: payload.get("processingTimeMs").and_then(|v| v.as_u64()),
    }))
}

/// Renders `value` as a single-quoted string literal for a search filter.
///
/// Backslashes and single quotes are backslash-escaped so the value cannot
/// terminate the literal and inject further filter clauses. Double quotes are
/// dropped, which keeps the stripping this endpoint has always applied.
fn quote_filter_value(value: &str) -> String {
    let mut quoted = String::with_capacity(value.len() + 2);
    quoted.push('\'');
    for ch in value.chars() {
        match ch {
            '"' => {}
            '\\' | '\'' => {
                quoted.push('\\');
                quoted.push(ch);
            }
            _ => quoted.push(ch),
        }
    }
    quoted.push('\'');
    quoted
}

View File

@@ -7,7 +7,16 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
axum.workspace = true
chrono.workspace = true
parsers = { path = "../../crates/parsers" }
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
sha2.workspace = true
sqlx.workspace = true
stripstream-core = { path = "../../crates/core" }
tokio.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true
uuid.workspace = true
walkdir.workspace = true

View File

@@ -1,6 +1,30 @@
use anyhow::Context;
use axum::{routing::get, Router};
use chrono::{DateTime, Utc};
use parsers::{detect_format, parse_metadata, BookFormat};
use serde::Serialize;
use sha2::{Digest, Sha256};
use sqlx::{postgres::PgPoolOptions, Row};
use std::{collections::HashMap, path::Path, time::Duration};
use stripstream_core::config::IndexerConfig;
use tracing::info;
use tracing::{error, info};
use uuid::Uuid;
use walkdir::WalkDir;
/// State handed to the background worker: database pool plus Meilisearch connection settings.
#[derive(Clone)]
struct AppState {
/// PostgreSQL connection pool.
pool: sqlx::PgPool,
/// Base URL of the Meilisearch instance, copied from `config.meili_url`.
meili_url: String,
/// Key sent as the `Authorization: Bearer` credential, copied from `config.meili_master_key`.
meili_master_key: String,
}
/// Counters accumulated while a job runs; serialized into `index_jobs.stats_json` on success.
#[derive(Serialize)]
struct JobStats {
/// Files with a recognized format that the scan visited.
scanned_files: usize,
/// Files that were newly inserted or re-parsed successfully.
indexed_files: usize,
/// `book_files` rows deleted because their path was not seen during the scan.
removed_files: usize,
/// Parse failures plus whole-library scan failures.
errors: usize,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
@@ -10,7 +34,20 @@ async fn main() -> anyhow::Result<()> {
)
.init();
let config = IndexerConfig::from_env();
let config = IndexerConfig::from_env()?;
let pool = PgPoolOptions::new()
.max_connections(5)
.connect(&config.database_url)
.await?;
let state = AppState {
pool,
meili_url: config.meili_url.clone(),
meili_master_key: config.meili_master_key.clone(),
};
tokio::spawn(run_worker(state.clone(), config.scan_interval_seconds));
let app = Router::new().route("/health", get(health));
let listener = tokio::net::TcpListener::bind(&config.listen_addr).await?;
@@ -22,3 +59,363 @@ async fn main() -> anyhow::Result<()> {
/// Handler for the `/health` route; always returns the static string `ok`.
async fn health() -> &'static str {
"ok"
}
async fn run_worker(state: AppState, interval_seconds: u64) {
let wait = Duration::from_secs(interval_seconds.max(1));
loop {
match claim_next_job(&state.pool).await {
Ok(Some((job_id, library_id))) => {
if let Err(err) = process_job(&state, job_id, library_id).await {
error!(job_id = %job_id, error = %err, "index job failed");
let _ = fail_job(&state.pool, job_id, &err.to_string()).await;
}
}
Ok(None) => tokio::time::sleep(wait).await,
Err(err) => {
error!(error = %err, "worker loop error");
tokio::time::sleep(wait).await;
}
}
}
}
/// Claims the oldest pending index job, if there is one.
///
/// Inside one transaction the oldest `'pending'` row is selected with
/// `FOR UPDATE SKIP LOCKED` and switched to `'running'` (setting `started_at`
/// and clearing `error_opt`).
///
/// Returns the job id and its optional library id, or `None` when no pending
/// job could be selected.
async fn claim_next_job(pool: &sqlx::PgPool) -> anyhow::Result<Option<(Uuid, Option<Uuid>)>> {
    let mut tx = pool.begin().await?;

    let candidate = sqlx::query(
        r#"
SELECT id, library_id
FROM index_jobs
WHERE status = 'pending'
ORDER BY created_at ASC
FOR UPDATE SKIP LOCKED
LIMIT 1
"#,
    )
    .fetch_optional(&mut *tx)
    .await?;

    match candidate {
        None => {
            tx.commit().await?;
            Ok(None)
        }
        Some(row) => {
            let id: Uuid = row.get("id");
            let library_id: Option<Uuid> = row.get("library_id");

            sqlx::query("UPDATE index_jobs SET status = 'running', started_at = NOW(), error_opt = NULL WHERE id = $1")
                .bind(id)
                .execute(&mut *tx)
                .await?;
            tx.commit().await?;

            Ok(Some((id, library_id)))
        }
    }
}
/// Runs one index job: scans the targeted libraries, syncs Meilisearch, and records success.
///
/// With `target_library_id` set, only that library is scanned (and only if it
/// is enabled); otherwise every enabled library is scanned. A failing library
/// scan is logged and counted in `stats.errors` but does not abort the job.
///
/// # Errors
/// Returns an error when a library query, the Meilisearch sync, the stats
/// serialization, or the final status update fails. The job row is not updated
/// in that case.
async fn process_job(state: &AppState, job_id: Uuid, target_library_id: Option<Uuid>) -> anyhow::Result<()> {
    let libraries = match target_library_id {
        Some(library_id) => {
            sqlx::query("SELECT id, root_path FROM libraries WHERE id = $1 AND enabled = TRUE")
                .bind(library_id)
                .fetch_all(&state.pool)
                .await?
        }
        None => {
            sqlx::query("SELECT id, root_path FROM libraries WHERE enabled = TRUE")
                .fetch_all(&state.pool)
                .await?
        }
    };

    let mut stats = JobStats {
        scanned_files: 0,
        indexed_files: 0,
        removed_files: 0,
        errors: 0,
    };

    for library in libraries {
        let library_id: Uuid = library.get("id");
        let root_path: String = library.get("root_path");
        let outcome = scan_library(state, library_id, Path::new(&root_path), &mut stats).await;
        if let Err(err) = outcome {
            stats.errors += 1;
            error!(library_id = %library_id, error = %err, "library scan failed");
        }
    }

    sync_meili(&state.pool, &state.meili_url, &state.meili_master_key).await?;

    let update = sqlx::query("UPDATE index_jobs SET status = 'success', finished_at = NOW(), stats_json = $2 WHERE id = $1");
    update
        .bind(job_id)
        .bind(serde_json::to_value(&stats)?)
        .execute(&state.pool)
        .await?;
    Ok(())
}
/// Marks the job as `'failed'`, stamping `finished_at` and storing `error_message` in `error_opt`.
///
/// # Errors
/// Returns the database error if the update cannot be executed.
async fn fail_job(pool: &sqlx::PgPool, job_id: Uuid, error_message: &str) -> anyhow::Result<()> {
    let update = sqlx::query("UPDATE index_jobs SET status = 'failed', finished_at = NOW(), error_opt = $2 WHERE id = $1")
        .bind(job_id)
        .bind(error_message);
    update.execute(pool).await?;
    Ok(())
}
async fn scan_library(
state: &AppState,
library_id: Uuid,
root: &Path,
stats: &mut JobStats,
) -> anyhow::Result<()> {
let existing_rows = sqlx::query(
r#"
SELECT bf.id AS file_id, bf.book_id, bf.abs_path, bf.fingerprint
FROM book_files bf
JOIN books b ON b.id = bf.book_id
WHERE b.library_id = $1
"#,
)
.bind(library_id)
.fetch_all(&state.pool)
.await?;
let mut existing: HashMap<String, (Uuid, Uuid, String)> = HashMap::new();
for row in existing_rows {
existing.insert(
row.get("abs_path"),
(row.get("file_id"), row.get("book_id"), row.get("fingerprint")),
);
}
let mut seen: HashMap<String, bool> = HashMap::new();
for entry in WalkDir::new(root).into_iter().filter_map(Result::ok) {
if !entry.file_type().is_file() {
continue;
}
let path = entry.path();
let Some(format) = detect_format(path) else {
continue;
};
stats.scanned_files += 1;
let abs_path = path.to_string_lossy().to_string();
seen.insert(abs_path.clone(), true);
let metadata = std::fs::metadata(path)
.with_context(|| format!("cannot stat {}", path.display()))?;
let mtime: DateTime<Utc> = metadata
.modified()
.map(DateTime::<Utc>::from)
.unwrap_or_else(|_| Utc::now());
let fingerprint = compute_fingerprint(path, metadata.len(), &mtime)?;
if let Some((file_id, book_id, old_fingerprint)) = existing.get(&abs_path).cloned() {
if old_fingerprint == fingerprint {
continue;
}
match parse_metadata(path, format) {
Ok(parsed) => {
sqlx::query(
"UPDATE books SET title = $2, kind = $3, page_count = $4, updated_at = NOW() WHERE id = $1",
)
.bind(book_id)
.bind(parsed.title)
.bind(kind_from_format(format))
.bind(parsed.page_count)
.execute(&state.pool)
.await?;
sqlx::query(
"UPDATE book_files SET format = $2, size_bytes = $3, mtime = $4, fingerprint = $5, parse_status = 'ok', parse_error_opt = NULL, updated_at = NOW() WHERE id = $1",
)
.bind(file_id)
.bind(format.as_str())
.bind(metadata.len() as i64)
.bind(mtime)
.bind(fingerprint)
.execute(&state.pool)
.await?;
stats.indexed_files += 1;
}
Err(err) => {
stats.errors += 1;
sqlx::query(
"UPDATE book_files SET parse_status = 'error', parse_error_opt = $2, updated_at = NOW() WHERE id = $1",
)
.bind(file_id)
.bind(err.to_string())
.execute(&state.pool)
.await?;
}
}
continue;
}
match parse_metadata(path, format) {
Ok(parsed) => {
let book_id = Uuid::new_v4();
let file_id = Uuid::new_v4();
sqlx::query(
"INSERT INTO books (id, library_id, kind, title, page_count) VALUES ($1, $2, $3, $4, $5)",
)
.bind(book_id)
.bind(library_id)
.bind(kind_from_format(format))
.bind(parsed.title)
.bind(parsed.page_count)
.execute(&state.pool)
.await?;
sqlx::query(
"INSERT INTO book_files (id, book_id, format, abs_path, size_bytes, mtime, fingerprint, parse_status) VALUES ($1, $2, $3, $4, $5, $6, $7, 'ok')",
)
.bind(file_id)
.bind(book_id)
.bind(format.as_str())
.bind(&abs_path)
.bind(metadata.len() as i64)
.bind(mtime)
.bind(fingerprint)
.execute(&state.pool)
.await?;
stats.indexed_files += 1;
}
Err(err) => {
stats.errors += 1;
let book_id = Uuid::new_v4();
let file_id = Uuid::new_v4();
sqlx::query(
"INSERT INTO books (id, library_id, kind, title, page_count) VALUES ($1, $2, $3, $4, NULL)",
)
.bind(book_id)
.bind(library_id)
.bind(kind_from_format(format))
.bind(file_display_name(path))
.execute(&state.pool)
.await?;
sqlx::query(
"INSERT INTO book_files (id, book_id, format, abs_path, size_bytes, mtime, fingerprint, parse_status, parse_error_opt) VALUES ($1, $2, $3, $4, $5, $6, $7, 'error', $8)",
)
.bind(file_id)
.bind(book_id)
.bind(format.as_str())
.bind(&abs_path)
.bind(metadata.len() as i64)
.bind(mtime)
.bind(fingerprint)
.bind(err.to_string())
.execute(&state.pool)
.await?;
}
}
}
for (abs_path, (file_id, book_id, _)) in existing {
if seen.contains_key(&abs_path) {
continue;
}
sqlx::query("DELETE FROM book_files WHERE id = $1")
.bind(file_id)
.execute(&state.pool)
.await?;
sqlx::query("DELETE FROM books WHERE id = $1 AND NOT EXISTS (SELECT 1 FROM book_files WHERE book_id = $1)")
.bind(book_id)
.execute(&state.pool)
.await?;
stats.removed_files += 1;
}
Ok(())
}
/// Computes a cheap change-detection fingerprint for a file.
///
/// The SHA-256 digest covers, in order: `size` (little-endian), the `mtime`
/// Unix timestamp in seconds (little-endian), and the first 64 KiB of the file
/// contents. It is returned as a lowercase hex string.
///
/// Only the sampled prefix is read from disk, so the cost does not grow with
/// the size of the file. The digest is the same as hashing the first 64 KiB of
/// the fully loaded file.
///
/// # Errors
/// Returns the I/O error if the file cannot be opened or read.
fn compute_fingerprint(path: &Path, size: u64, mtime: &DateTime<Utc>) -> anyhow::Result<String> {
    use std::io::Read;

    /// Number of leading content bytes mixed into the fingerprint.
    const SAMPLE_BYTES: u64 = 65_536;

    let mut hasher = Sha256::new();
    hasher.update(size.to_le_bytes());
    hasher.update(mtime.timestamp().to_le_bytes());

    let mut sample = Vec::with_capacity(SAMPLE_BYTES as usize);
    std::fs::File::open(path)?
        .take(SAMPLE_BYTES)
        .read_to_end(&mut sample)?;
    hasher.update(&sample);

    Ok(format!("{:x}", hasher.finalize()))
}
/// Maps a file format to the `kind` value stored on the book: PDFs are
/// `"ebook"`, CBZ and CBR archives are `"comic"`.
fn kind_from_format(format: BookFormat) -> &'static str {
    match format {
        BookFormat::Cbz => "comic",
        BookFormat::Cbr => "comic",
        BookFormat::Pdf => "ebook",
    }
}
/// Derives a fallback title from a path: its file stem (file name without the
/// last extension), lossily converted to UTF-8, or `"Untitled"` when the path
/// has no file name.
fn file_display_name(path: &Path) -> String {
    match path.file_stem() {
        Some(stem) => stem.to_string_lossy().to_string(),
        None => "Untitled".to_string(),
    }
}
/// Document pushed to the Meilisearch `books` index by `sync_meili`, one per `books` row.
#[derive(Serialize)]
struct SearchDoc {
/// Book id rendered as a string; `sync_meili` declares it as the index primary key.
id: String,
/// Library id rendered as a string.
library_id: String,
kind: String,
title: String,
author: Option<String>,
series: Option<String>,
volume: Option<String>,
language: Option<String>,
}
async fn sync_meili(pool: &sqlx::PgPool, meili_url: &str, meili_master_key: &str) -> anyhow::Result<()> {
let client = reqwest::Client::new();
let base = meili_url.trim_end_matches('/');
let _ = client
.post(format!("{base}/indexes"))
.header("Authorization", format!("Bearer {meili_master_key}"))
.json(&serde_json::json!({"uid": "books", "primaryKey": "id"}))
.send()
.await;
let _ = client
.patch(format!("{base}/indexes/books/settings/filterable-attributes"))
.header("Authorization", format!("Bearer {meili_master_key}"))
.json(&serde_json::json!(["library_id", "kind"]))
.send()
.await;
let rows = sqlx::query(
"SELECT id, library_id, kind, title, author, series, volume, language FROM books",
)
.fetch_all(pool)
.await?;
let docs: Vec<SearchDoc> = rows
.into_iter()
.map(|row| SearchDoc {
id: row.get::<Uuid, _>("id").to_string(),
library_id: row.get::<Uuid, _>("library_id").to_string(),
kind: row.get("kind"),
title: row.get("title"),
author: row.get("author"),
series: row.get("series"),
volume: row.get("volume"),
language: row.get("language"),
})
.collect();
client
.put(format!("{base}/indexes/books/documents?primaryKey=id"))
.header("Authorization", format!("Bearer {meili_master_key}"))
.json(&docs)
.send()
.await
.context("failed to push docs to meili")?;
Ok(())
}