Add a complete metadata synchronization system allowing users to search and sync series/book metadata from external providers (Google Books, Open Library, ComicVine, AniList, Bédéthèque). Each library can use a different provider. Matching requires manual approval with detailed sync reports showing what was updated or skipped (locked fields protection). Key changes: - DB migrations: external_metadata_links, external_book_metadata tables, library metadata_provider column, locked_fields, total_volumes, book metadata fields (summary, isbn, publish_date) - Rust API: MetadataProvider trait + 5 provider implementations, 7 metadata endpoints (search, match, approve, reject, links, missing, delete), sync report system, provider language preference support - Backoffice: MetadataSearchModal, ProviderIcon, SafeHtml components, settings UI for provider/language config, enriched book detail page, edit forms with locked fields support, API proxy routes - OpenAPI/Swagger documentation for all new endpoints and schemas Closes #3 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
577 lines
19 KiB
Rust
577 lines
19 KiB
Rust
use scraper::{Html, Selector};
|
|
|
|
use super::{BookCandidate, MetadataProvider, ProviderConfig, SeriesCandidate};
|
|
|
|
/// Stateless metadata provider that scrapes `www.bedetheque.com`.
///
/// All behaviour lives in the `MetadataProvider` implementation; the provider
/// identifies itself as `"bedetheque"`.
pub struct BedethequeProvider;
|
|
|
|
impl MetadataProvider for BedethequeProvider {
|
|
fn name(&self) -> &str {
|
|
"bedetheque"
|
|
}
|
|
|
|
fn search_series(
|
|
&self,
|
|
query: &str,
|
|
config: &ProviderConfig,
|
|
) -> std::pin::Pin<
|
|
Box<dyn std::future::Future<Output = Result<Vec<SeriesCandidate>, String>> + Send + '_>,
|
|
> {
|
|
let query = query.to_string();
|
|
let config = config.clone();
|
|
Box::pin(async move { search_series_impl(&query, &config).await })
|
|
}
|
|
|
|
fn get_series_books(
|
|
&self,
|
|
external_id: &str,
|
|
config: &ProviderConfig,
|
|
) -> std::pin::Pin<
|
|
Box<dyn std::future::Future<Output = Result<Vec<BookCandidate>, String>> + Send + '_>,
|
|
> {
|
|
let external_id = external_id.to_string();
|
|
let config = config.clone();
|
|
Box::pin(async move { get_series_books_impl(&external_id, &config).await })
|
|
}
|
|
}
|
|
|
|
/// Builds the HTTP client used for Bedetheque requests.
///
/// The client uses a 20-second timeout, a Firefox user agent, and default
/// `Accept`, `Accept-Language` (French first) and `Referer` headers.
///
/// # Errors
/// Returns a message when `reqwest` cannot build the client.
fn build_client() -> Result<reqwest::Client, String> {
    use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, ACCEPT_LANGUAGE, REFERER};

    // All header values are static ASCII literals, so they can be built without parsing.
    let mut default_headers = HeaderMap::new();
    default_headers.insert(
        ACCEPT,
        HeaderValue::from_static("text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
    );
    default_headers.insert(
        ACCEPT_LANGUAGE,
        HeaderValue::from_static("fr-FR,fr;q=0.9,en;q=0.5"),
    );
    default_headers.insert(
        REFERER,
        HeaderValue::from_static("https://www.bedetheque.com/"),
    );

    let builder = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(20))
        .user_agent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0")
        .default_headers(default_headers);

    builder
        .build()
        .map_err(|e| format!("failed to build HTTP client: {e}"))
}
|
|
|
|
/// Strips common French/Spanish diacritics so the text can be used in a Bedetheque URL.
///
/// Accented letters (upper- or lowercase) are replaced by their *lowercase* base letter;
/// every other character, including unaccented uppercase letters, is kept unchanged.
fn normalize_for_url(s: &str) -> String {
    /// Maps one character to its unaccented form, or returns it untouched.
    fn strip_accent(ch: char) -> char {
        match ch {
            'é' | 'è' | 'ê' | 'ë' | 'É' | 'È' | 'Ê' | 'Ë' => 'e',
            'à' | 'â' | 'ä' | 'À' | 'Â' | 'Ä' => 'a',
            'ù' | 'û' | 'ü' | 'Ù' | 'Û' | 'Ü' => 'u',
            'ô' | 'ö' | 'Ô' | 'Ö' => 'o',
            'î' | 'ï' | 'Î' | 'Ï' => 'i',
            'ç' | 'Ç' => 'c',
            'ñ' | 'Ñ' => 'n',
            other => other,
        }
    }

    let mut normalized = String::with_capacity(s.len());
    for ch in s.chars() {
        normalized.push(strip_accent(ch));
    }
    normalized
}
|
|
|
|
/// Encodes `s` for use as a query-string value.
///
/// ASCII letters, digits and `-`, `_`, `.`, `~` are kept, a space becomes `+`, and every
/// other byte of the UTF-8 encoding is written as `%XX` with uppercase hex digits.
fn urlencoded(s: &str) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let mut encoded = String::new();
    for &b in s.as_bytes() {
        if b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.' | b'~') {
            encoded.push(char::from(b));
        } else if b == b' ' {
            encoded.push('+');
        } else {
            encoded.push('%');
            encoded.push(char::from(HEX_DIGITS[usize::from(b >> 4)]));
            encoded.push(char::from(HEX_DIGITS[usize::from(b & 0x0F)]));
        }
    }
    encoded
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Search
|
|
// ---------------------------------------------------------------------------
|
|
|
|
async fn search_series_impl(
|
|
query: &str,
|
|
_config: &ProviderConfig,
|
|
) -> Result<Vec<SeriesCandidate>, String> {
|
|
let client = build_client()?;
|
|
|
|
// Use the full-text search page
|
|
let url = format!(
|
|
"https://www.bedetheque.com/search/tout?RechTexte={}&RechWhere=0",
|
|
urlencoded(&normalize_for_url(query))
|
|
);
|
|
|
|
let resp = client
|
|
.get(&url)
|
|
.send()
|
|
.await
|
|
.map_err(|e| format!("Bedetheque request failed: {e}"))?;
|
|
|
|
if !resp.status().is_success() {
|
|
let status = resp.status();
|
|
return Err(format!("Bedetheque returned {status}"));
|
|
}
|
|
|
|
let html = resp
|
|
.text()
|
|
.await
|
|
.map_err(|e| format!("Failed to read Bedetheque response: {e}"))?;
|
|
|
|
// Detect IP blacklist
|
|
if html.contains("<title></title>") || html.contains("<title> </title>") {
|
|
return Err("Bedetheque: IP may be rate-limited, please retry later".to_string());
|
|
}
|
|
|
|
// Parse HTML in a block so the non-Send Html type is dropped before any .await
|
|
let candidates = {
|
|
let document = Html::parse_document(&html);
|
|
let link_sel =
|
|
Selector::parse("a[href*='/serie-']").map_err(|e| format!("selector error: {e}"))?;
|
|
|
|
let query_lower = query.to_lowercase();
|
|
let mut seen = std::collections::HashSet::new();
|
|
let mut candidates = Vec::new();
|
|
|
|
for el in document.select(&link_sel) {
|
|
let href = match el.value().attr("href") {
|
|
Some(h) => h.to_string(),
|
|
None => continue,
|
|
};
|
|
|
|
let (series_id, _slug) = match parse_serie_href(&href) {
|
|
Some(v) => v,
|
|
None => continue,
|
|
};
|
|
|
|
if !seen.insert(series_id.clone()) {
|
|
continue;
|
|
}
|
|
|
|
let title = el.text().collect::<String>().trim().to_string();
|
|
if title.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
let confidence = compute_confidence(&title, &query_lower);
|
|
let cover_url = format!(
|
|
"https://www.bedetheque.com/cache/thb_series/PlancheS_{}.jpg",
|
|
series_id
|
|
);
|
|
|
|
candidates.push(SeriesCandidate {
|
|
external_id: series_id.clone(),
|
|
title: title.clone(),
|
|
authors: vec![],
|
|
description: None,
|
|
publishers: vec![],
|
|
start_year: None,
|
|
total_volumes: None,
|
|
cover_url: Some(cover_url),
|
|
external_url: Some(href),
|
|
confidence,
|
|
metadata_json: serde_json::json!({}),
|
|
});
|
|
}
|
|
|
|
candidates.sort_by(|a, b| {
|
|
b.confidence
|
|
.partial_cmp(&a.confidence)
|
|
.unwrap_or(std::cmp::Ordering::Equal)
|
|
});
|
|
candidates.truncate(10);
|
|
candidates
|
|
}; // document is dropped here — safe to .await below
|
|
|
|
// For the top candidates, fetch series details to enrich metadata
|
|
// (limit to top 3 to avoid hammering the site)
|
|
let mut enriched = Vec::new();
|
|
for mut c in candidates {
|
|
if enriched.len() < 3 {
|
|
if let Ok(details) = fetch_series_details(&client, &c.external_id, c.external_url.as_deref()).await {
|
|
if let Some(desc) = details.description {
|
|
c.description = Some(desc);
|
|
}
|
|
if !details.authors.is_empty() {
|
|
c.authors = details.authors;
|
|
}
|
|
if !details.publishers.is_empty() {
|
|
c.publishers = details.publishers;
|
|
}
|
|
if let Some(year) = details.start_year {
|
|
c.start_year = Some(year);
|
|
}
|
|
if let Some(count) = details.album_count {
|
|
c.total_volumes = Some(count);
|
|
}
|
|
c.metadata_json = serde_json::json!({
|
|
"description": c.description,
|
|
"authors": c.authors,
|
|
"publishers": c.publishers,
|
|
"start_year": c.start_year,
|
|
});
|
|
}
|
|
}
|
|
enriched.push(c);
|
|
}
|
|
|
|
Ok(enriched)
|
|
}
|
|
|
|
/// Parse serie URL to extract (id, slug)
|
|
fn parse_serie_href(href: &str) -> Option<(String, String)> {
|
|
// Patterns:
|
|
// https://www.bedetheque.com/serie-3-BD-Blacksad.html
|
|
// /serie-3-BD-Blacksad.html
|
|
let re = regex::Regex::new(r"/serie-(\d+)-[A-Za-z]+-(.+?)(?:__\d+)?\.html").ok()?;
|
|
let caps = re.captures(href)?;
|
|
Some((caps[1].to_string(), caps[2].to_string()))
|
|
}
|
|
|
|
/// Metadata scraped from a single Bedetheque series page.
///
/// Every field is optional/empty when the corresponding information was not found;
/// `Default` yields that all-empty value.
#[derive(Debug, Clone, Default, PartialEq)]
struct SeriesDetails {
    /// Series description text, if any was found on the page.
    description: Option<String>,
    /// Author names collected from the album info entries.
    authors: Vec<String>,
    /// Publisher names collected from the album info entries.
    publishers: Vec<String>,
    /// Year taken from the first legal-deposit date found on the page.
    start_year: Option<i32>,
    /// Number of volumes announced on the page ("Tomes : N").
    album_count: Option<i32>,
}
|
|
|
|
async fn fetch_series_details(
|
|
client: &reqwest::Client,
|
|
series_id: &str,
|
|
series_url: Option<&str>,
|
|
) -> Result<SeriesDetails, String> {
|
|
// Build URL — append __10000 to get all albums on one page
|
|
let url = match series_url {
|
|
Some(u) => {
|
|
// Replace .html with __10000.html
|
|
u.replace(".html", "__10000.html")
|
|
}
|
|
None => format!(
|
|
"https://www.bedetheque.com/serie-{}-BD-Serie__10000.html",
|
|
series_id
|
|
),
|
|
};
|
|
|
|
let resp = client
|
|
.get(&url)
|
|
.send()
|
|
.await
|
|
.map_err(|e| format!("Failed to fetch series page: {e}"))?;
|
|
|
|
if !resp.status().is_success() {
|
|
return Err(format!("Series page returned {}", resp.status()));
|
|
}
|
|
|
|
let html = resp
|
|
.text()
|
|
.await
|
|
.map_err(|e| format!("Failed to read series page: {e}"))?;
|
|
|
|
let doc = Html::parse_document(&html);
|
|
let mut details = SeriesDetails {
|
|
description: None,
|
|
authors: vec![],
|
|
publishers: vec![],
|
|
start_year: None,
|
|
album_count: None,
|
|
};
|
|
|
|
// Description: look for #full-commentaire or .serie-info
|
|
if let Ok(sel) = Selector::parse("#full-commentaire") {
|
|
if let Some(el) = doc.select(&sel).next() {
|
|
let text = el.text().collect::<String>().trim().to_string();
|
|
if !text.is_empty() {
|
|
details.description = Some(text);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fallback description from span.infoedition
|
|
if details.description.is_none() {
|
|
if let Ok(sel) = Selector::parse("span.infoedition") {
|
|
if let Some(el) = doc.select(&sel).next() {
|
|
let text = el.text().collect::<String>().trim().to_string();
|
|
if !text.is_empty() {
|
|
details.description = Some(text);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extract authors and publishers from album info blocks
|
|
if let Ok(sel) = Selector::parse(".infos li") {
|
|
let mut authors_set = std::collections::HashSet::new();
|
|
let mut publishers_set = std::collections::HashSet::new();
|
|
|
|
for li in doc.select(&sel) {
|
|
let text = li.text().collect::<String>();
|
|
let text = text.trim();
|
|
|
|
if let Some(val) = extract_info_value(text, "Scénario") {
|
|
for a in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
|
|
authors_set.insert(a.to_string());
|
|
}
|
|
}
|
|
if let Some(val) = extract_info_value(text, "Dessin") {
|
|
for a in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
|
|
authors_set.insert(a.to_string());
|
|
}
|
|
}
|
|
if let Some(val) = extract_info_value(text, "Editeur") {
|
|
for p in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
|
|
publishers_set.insert(p.to_string());
|
|
}
|
|
}
|
|
}
|
|
|
|
details.authors = authors_set.into_iter().collect();
|
|
details.authors.sort();
|
|
details.publishers = publishers_set.into_iter().collect();
|
|
details.publishers.sort();
|
|
}
|
|
|
|
// Album count from serie-info text (e.g. "Tomes : 8")
|
|
let page_text = doc.root_element().text().collect::<String>();
|
|
if let Ok(re) = regex::Regex::new(r"Tomes?\s*:\s*(\d+)") {
|
|
if let Some(caps) = re.captures(&page_text) {
|
|
if let Ok(n) = caps[1].parse::<i32>() {
|
|
details.album_count = Some(n);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Start year from first album date (Dépot légal)
|
|
if let Ok(re) = regex::Regex::new(r"[Dd][ée]p[ôo]t l[ée]gal\s*:\s*\d{2}/(\d{4})") {
|
|
if let Some(caps) = re.captures(&page_text) {
|
|
if let Ok(year) = caps[1].parse::<i32>() {
|
|
details.start_year = Some(year);
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(details)
|
|
}
|
|
|
|
/// Extracts the value that follows a label, e.g. "Scénario : Jean-Claude" → "Jean-Claude".
///
/// The label is looked up as `Label :`, `Label:`, and the same two forms with the label
/// lowercased, in that order. The value is the first non-blank line after the label,
/// trimmed: whitespace (including line breaks) right after the colon is skipped, and the
/// value stops at the next line break so that, when `text` holds several
/// `Label : value` entries, the following entries are not swallowed.
///
/// Returns `None` when no form of the label is found or every match has an empty value.
fn extract_info_value<'a>(text: &'a str, label: &str) -> Option<&'a str> {
    let lower = label.to_lowercase();
    // Handle both "Label :" and "Label:", for the given and the lowercased label.
    let patterns = [
        format!("{label} :"),
        format!("{label}:"),
        format!("{lower} :"),
        format!("{lower}:"),
    ];

    for pat in &patterns {
        if let Some(pos) = text.find(pat.as_str()) {
            let rest = text[pos + pat.len()..].trim_start();
            let val = rest.lines().next().unwrap_or("").trim_end();
            if !val.is_empty() {
                return Some(val);
            }
        }
    }
    None
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Get series books
|
|
// ---------------------------------------------------------------------------
|
|
|
|
async fn get_series_books_impl(
|
|
external_id: &str,
|
|
_config: &ProviderConfig,
|
|
) -> Result<Vec<BookCandidate>, String> {
|
|
let client = build_client()?;
|
|
|
|
// We need to find the series URL — try a direct fetch
|
|
// external_id is the numeric series ID
|
|
// We try to fetch the series page to get the album list
|
|
let url = format!(
|
|
"https://www.bedetheque.com/serie-{}-BD-Serie__10000.html",
|
|
external_id
|
|
);
|
|
|
|
let resp = client
|
|
.get(&url)
|
|
.send()
|
|
.await
|
|
.map_err(|e| format!("Failed to fetch series: {e}"))?;
|
|
|
|
// If the generic slug fails, try without the slug part (bedetheque redirects)
|
|
let html = if resp.status().is_success() {
|
|
resp.text().await.map_err(|e| format!("Failed to read: {e}"))?
|
|
} else {
|
|
// Try alternative URL pattern
|
|
let alt_url = format!(
|
|
"https://www.bedetheque.com/serie-{}__10000.html",
|
|
external_id
|
|
);
|
|
let resp2 = client
|
|
.get(&alt_url)
|
|
.send()
|
|
.await
|
|
.map_err(|e| format!("Failed to fetch series (alt): {e}"))?;
|
|
if !resp2.status().is_success() {
|
|
return Err(format!("Series page not found for id {external_id}"));
|
|
}
|
|
resp2.text().await.map_err(|e| format!("Failed to read: {e}"))?
|
|
};
|
|
|
|
if html.contains("<title></title>") {
|
|
return Err("Bedetheque: IP may be rate-limited".to_string());
|
|
}
|
|
|
|
let doc = Html::parse_document(&html);
|
|
let mut books = Vec::new();
|
|
|
|
// Albums are in .album-main blocks
|
|
let album_sel = Selector::parse(".album-main").map_err(|e| format!("selector: {e}"))?;
|
|
|
|
for album_el in doc.select(&album_sel) {
|
|
let album_html = album_el.html();
|
|
let album_doc = Html::parse_fragment(&album_html);
|
|
|
|
// Title from .titre
|
|
let title = select_text(&album_doc, ".titre")
|
|
.or_else(|| {
|
|
Selector::parse(".titre a")
|
|
.ok()
|
|
.and_then(|s| album_doc.select(&s).next())
|
|
.map(|el| el.text().collect::<String>().trim().to_string())
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
if title.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
// Volume number from title or .num span
|
|
let volume_number = select_text(&album_doc, ".num")
|
|
.and_then(|s| {
|
|
s.trim_end_matches('.')
|
|
.trim()
|
|
.parse::<i32>()
|
|
.ok()
|
|
})
|
|
.or_else(|| extract_volume_from_title(&title));
|
|
|
|
// Album URL
|
|
let album_url = Selector::parse("a[href*='/BD-']")
|
|
.ok()
|
|
.and_then(|s| album_doc.select(&s).next())
|
|
.and_then(|el| el.value().attr("href"))
|
|
.map(String::from);
|
|
|
|
// External book id from URL
|
|
let external_book_id = album_url
|
|
.as_deref()
|
|
.and_then(|u| {
|
|
regex::Regex::new(r"-(\d+)\.html")
|
|
.ok()
|
|
.and_then(|re| re.captures(u))
|
|
.map(|c| c[1].to_string())
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
// Cover
|
|
let cover_url = Selector::parse("img[src*='cache/thb_couv']")
|
|
.ok()
|
|
.and_then(|s| album_doc.select(&s).next())
|
|
.and_then(|el| el.value().attr("src"))
|
|
.map(|s| {
|
|
if s.starts_with("http") {
|
|
s.to_string()
|
|
} else {
|
|
format!("https://www.bedetheque.com{}", s)
|
|
}
|
|
});
|
|
|
|
// Extract info fields
|
|
let album_text = album_el.text().collect::<String>();
|
|
let authors = extract_all_authors(&album_text);
|
|
let isbn = extract_info_value(&album_text, "EAN/ISBN")
|
|
.or_else(|| extract_info_value(&album_text, "ISBN"))
|
|
.map(|s| s.trim().to_string());
|
|
let page_count = extract_info_value(&album_text, "Planches")
|
|
.and_then(|s| s.trim().parse::<i32>().ok());
|
|
let publish_date = extract_info_value(&album_text, "Dépot légal")
|
|
.or_else(|| extract_info_value(&album_text, "Depot legal"))
|
|
.map(|s| s.trim().to_string());
|
|
|
|
books.push(BookCandidate {
|
|
external_book_id,
|
|
title,
|
|
volume_number,
|
|
authors,
|
|
isbn,
|
|
summary: None,
|
|
cover_url,
|
|
page_count,
|
|
language: Some("fr".to_string()),
|
|
publish_date,
|
|
metadata_json: serde_json::json!({}),
|
|
});
|
|
}
|
|
|
|
books.sort_by_key(|b| b.volume_number.unwrap_or(999));
|
|
Ok(books)
|
|
}
|
|
|
|
fn select_text(doc: &Html, selector: &str) -> Option<String> {
|
|
Selector::parse(selector)
|
|
.ok()
|
|
.and_then(|s| doc.select(&s).next())
|
|
.map(|el| el.text().collect::<String>().trim().to_string())
|
|
.filter(|s| !s.is_empty())
|
|
}
|
|
|
|
fn extract_all_authors(text: &str) -> Vec<String> {
|
|
let mut authors = Vec::new();
|
|
for label in ["Scénario", "Scenario", "Dessin"] {
|
|
if let Some(val) = extract_info_value(text, label) {
|
|
for a in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
|
|
if !authors.contains(&a.to_string()) {
|
|
authors.push(a.to_string());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
authors
|
|
}
|
|
|
|
fn extract_volume_from_title(title: &str) -> Option<i32> {
|
|
let patterns = [
|
|
r"(?i)(?:tome|t\.)\s*(\d+)",
|
|
r"(?i)(?:vol(?:ume)?\.?)\s*(\d+)",
|
|
r"#\s*(\d+)",
|
|
];
|
|
for pattern in &patterns {
|
|
if let Ok(re) = regex::Regex::new(pattern) {
|
|
if let Some(caps) = re.captures(title) {
|
|
if let Ok(n) = caps[1].parse::<i32>() {
|
|
return Some(n);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Scores how well `title` matches `query`, from 0.1 to 1.0.
///
/// `title` is lowercased here; `query` is compared as given, so callers should pass it
/// already lowercased.
///
/// * `1.0`  — identical.
/// * `0.85` — one is a prefix of the other.
/// * `0.7`  — one contains the other.
/// * otherwise — the share of query characters that occur in the title, relative to the
///   longer of the two strings, clamped to `0.1..=0.6`.
///
/// Lengths are measured in characters, not bytes, so accented titles are not penalised
/// for their multi-byte UTF-8 encoding.
fn compute_confidence(title: &str, query: &str) -> f32 {
    let title_lower = title.to_lowercase();

    if title_lower == query {
        return 1.0;
    }
    if title_lower.starts_with(query) || query.starts_with(&title_lower) {
        return 0.85;
    }
    if title_lower.contains(query) || query.contains(&title_lower) {
        return 0.7;
    }

    let common = query.chars().filter(|c| title_lower.contains(*c)).count();
    // `common` counts characters, so the denominator must count characters too.
    let longest = query
        .chars()
        .count()
        .max(title_lower.chars().count())
        .max(1);
    (common as f32 / longest as f32).clamp(0.1, 0.6)
}
|