feat: add series status, improve providers & e2e tests

- Add series status concept (ongoing/ended/hiatus/cancelled/upcoming)
  with normalization across all providers
- Add status field to series_metadata table (migration 0033)
- AniList: use chapters as fallback for volume count on ongoing series,
  add books_message when both volumes and chapters are null
- Bedetheque: extract description from meta tag, genres, parution status,
  origin/language; rewrite book parsing with itemprop microdata for
  clean ISBN, dates, page counts, covers; filter placeholder authors
- Add comprehensive e2e provider tests with field coverage reporting
- Wire status into EditSeriesForm, MetadataSearchModal, and series page

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-18 16:10:45 +01:00
parent 51ef2fa725
commit 52b9b0e00e
10 changed files with 566 additions and 156 deletions

View File

@@ -210,6 +210,10 @@ async fn search_series_impl(
"authors": c.authors,
"publishers": c.publishers,
"start_year": c.start_year,
"genres": details.genres,
"status": details.status,
"origin": details.origin,
"language": details.language,
});
}
}
@@ -235,6 +239,10 @@ struct SeriesDetails {
publishers: Vec<String>,
start_year: Option<i32>,
album_count: Option<i32>,
genres: Vec<String>,
status: Option<String>,
origin: Option<String>,
language: Option<String>,
}
async fn fetch_series_details(
@@ -276,64 +284,109 @@ async fn fetch_series_details(
publishers: vec![],
start_year: None,
album_count: None,
genres: vec![],
status: None,
origin: None,
language: None,
};
// Description: look for #full-commentaire or .serie-info
if let Ok(sel) = Selector::parse("#full-commentaire") {
// Description from <meta name="description"> — format: "Tout sur la série {name} : {description}"
if let Ok(sel) = Selector::parse(r#"meta[name="description"]"#) {
if let Some(el) = doc.select(&sel).next() {
let text = el.text().collect::<String>().trim().to_string();
if !text.is_empty() {
details.description = Some(text);
}
}
}
// Fallback description from span.infoedition
if details.description.is_none() {
if let Ok(sel) = Selector::parse("span.infoedition") {
if let Some(el) = doc.select(&sel).next() {
let text = el.text().collect::<String>().trim().to_string();
if !text.is_empty() {
details.description = Some(text);
if let Some(content) = el.value().attr("content") {
let desc = content.trim().to_string();
// Strip the "Tout sur la série ... : " prefix
let cleaned = if let Some(pos) = desc.find(" : ") {
desc[pos + 3..].trim().to_string()
} else {
desc
};
if !cleaned.is_empty() {
details.description = Some(cleaned);
}
}
}
}
// Extract authors and publishers from album info blocks
if let Ok(sel) = Selector::parse(".infos li") {
// Extract authors from itemprop="author" and itemprop="illustrator" (deduplicated)
{
let mut authors_set = std::collections::HashSet::new();
let mut publishers_set = std::collections::HashSet::new();
for li in doc.select(&sel) {
let text = li.text().collect::<String>();
let text = text.trim();
if let Some(val) = extract_info_value(text, "Scénario") {
for a in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
authors_set.insert(a.to_string());
}
}
if let Some(val) = extract_info_value(text, "Dessin") {
for a in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
authors_set.insert(a.to_string());
}
}
if let Some(val) = extract_info_value(text, "Editeur") {
for p in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
publishers_set.insert(p.to_string());
for attr in ["author", "illustrator"] {
if let Ok(sel) = Selector::parse(&format!(r#"[itemprop="{attr}"]"#)) {
for el in doc.select(&sel) {
let name = el.text().collect::<String>().trim().to_string();
// Names are "Last, First" — normalize to "First Last"
let normalized = if let Some((last, first)) = name.split_once(',') {
format!("{} {}", first.trim(), last.trim())
} else {
name
};
if !normalized.is_empty() && is_real_author(&normalized) {
authors_set.insert(normalized);
}
}
}
}
details.authors = authors_set.into_iter().collect();
details.authors.sort();
}
// Extract publishers from itemprop="publisher" (deduplicated)
{
let mut publishers_set = std::collections::HashSet::new();
if let Ok(sel) = Selector::parse(r#"[itemprop="publisher"]"#) {
for el in doc.select(&sel) {
let name = el.text().collect::<String>().trim().to_string();
if !name.is_empty() {
publishers_set.insert(name);
}
}
}
details.publishers = publishers_set.into_iter().collect();
details.publishers.sort();
}
// Album count from serie-info text (e.g. "Tomes : 8")
// Extract series-level info from <li><label>X :</label>value</li> blocks
// Genre: <li><label>Genre :</label><span class="style-serie">Animalier, Aventure, Humour</span></li>
if let Ok(sel) = Selector::parse("span.style-serie") {
if let Some(el) = doc.select(&sel).next() {
let text = el.text().collect::<String>();
details.genres = text
.split(',')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
.collect();
}
}
// Parution: <li><label>Parution :</label><span class="parution-serie">Série finie</span></li>
if let Ok(sel) = Selector::parse("span.parution-serie") {
if let Some(el) = doc.select(&sel).next() {
let text = el.text().collect::<String>().trim().to_string();
if !text.is_empty() {
details.status = Some(text);
}
}
}
// Origine and Langue from page text (no dedicated CSS class)
let page_text = doc.root_element().text().collect::<String>();
if let Some(val) = extract_info_value(&page_text, "Origine") {
let val = val.lines().next().unwrap_or(val).trim();
if !val.is_empty() {
details.origin = Some(val.to_string());
}
}
if let Some(val) = extract_info_value(&page_text, "Langue") {
let val = val.lines().next().unwrap_or(val).trim();
if !val.is_empty() {
details.language = Some(val.to_string());
}
}
// Album count from serie-info text (e.g. "Tomes : 8")
if let Ok(re) = regex::Regex::new(r"Tomes?\s*:\s*(\d+)") {
if let Some(caps) = re.captures(&page_text) {
if let Ok(n) = caps[1].parse::<i32>() {
@@ -342,11 +395,16 @@ async fn fetch_series_details(
}
}
// Start year from first album date (Dépot légal)
if let Ok(re) = regex::Regex::new(r"[Dd][ée]p[ôo]t l[ée]gal\s*:\s*\d{2}/(\d{4})") {
if let Some(caps) = re.captures(&page_text) {
if let Ok(year) = caps[1].parse::<i32>() {
details.start_year = Some(year);
// Start year from first <meta itemprop="datePublished" content="YYYY-MM-DD">
if let Ok(sel) = Selector::parse(r#"[itemprop="datePublished"]"#) {
if let Some(el) = doc.select(&sel).next() {
if let Some(content) = el.value().attr("content") {
// content is "YYYY-MM-DD"
if let Some(year_str) = content.split('-').next() {
if let Ok(year) = year_str.parse::<i32>() {
details.start_year = Some(year);
}
}
}
}
}
@@ -424,79 +482,91 @@ async fn get_series_books_impl(
let doc = Html::parse_document(&html);
let mut books = Vec::new();
// Albums are in .album-main blocks
// Each album block starts before a .album-main div.
// The cover image (<img itemprop="image">) is OUTSIDE .album-main (sibling),
// so we iterate over a broader parent. But the simplest approach: parse all
// itemprop elements relative to each .album-main, plus pick covers separately.
let album_sel = Selector::parse(".album-main").map_err(|e| format!("selector: {e}"))?;
for album_el in doc.select(&album_sel) {
let album_html = album_el.html();
let album_doc = Html::parse_fragment(&album_html);
// Pre-collect cover images — they appear in <img itemprop="image"> before each .album-main
// and link to an album URL containing the book ID
let cover_sel = Selector::parse(r#"img[itemprop="image"]"#).map_err(|e| format!("selector: {e}"))?;
let covers: Vec<String> = doc.select(&cover_sel)
.filter_map(|el| el.value().attr("src").map(|s| {
if s.starts_with("http") { s.to_string() } else { format!("https://www.bedetheque.com{}", s) }
}))
.collect();
// Title from .titre
let title = select_text(&album_doc, ".titre")
.or_else(|| {
Selector::parse(".titre a")
.ok()
.and_then(|s| album_doc.select(&s).next())
.map(|el| el.text().collect::<String>().trim().to_string())
})
.unwrap_or_default();
for (idx, album_el) in doc.select(&album_sel).enumerate() {
// Title from <a class="titre" title="..."> — the title attribute is clean
let title_sel = Selector::parse("a.titre").ok();
let title_el = title_sel.as_ref().and_then(|s| album_el.select(s).next());
let title = title_el
.and_then(|el| el.value().attr("title"))
.unwrap_or("")
.trim()
.to_string();
if title.is_empty() {
continue;
}
// Volume number from title or .num span
let volume_number = select_text(&album_doc, ".num")
.and_then(|s| {
s.trim_end_matches('.')
.trim()
.parse::<i32>()
.ok()
})
.or_else(|| extract_volume_from_title(&title));
// Album URL
let album_url = Selector::parse("a[href*='/BD-']")
// External book ID from album URL (e.g. "...-1063.html")
let album_url = title_el.and_then(|el| el.value().attr("href")).unwrap_or("");
let external_book_id = regex::Regex::new(r"-(\d+)\.html")
.ok()
.and_then(|s| album_doc.select(&s).next())
.and_then(|el| el.value().attr("href"))
.map(String::from);
// External book id from URL
let external_book_id = album_url
.as_deref()
.and_then(|u| {
regex::Regex::new(r"-(\d+)\.html")
.ok()
.and_then(|re| re.captures(u))
.map(|c| c[1].to_string())
})
.and_then(|re| re.captures(album_url))
.map(|c| c[1].to_string())
.unwrap_or_default();
// Cover
let cover_url = Selector::parse("img[src*='cache/thb_couv']")
// Volume number from URL pattern "Tome-{N}-" or from itemprop name
let volume_number = regex::Regex::new(r"(?i)Tome-(\d+)-")
.ok()
.and_then(|s| album_doc.select(&s).next())
.and_then(|el| el.value().attr("src"))
.map(|s| {
if s.starts_with("http") {
s.to_string()
} else {
format!("https://www.bedetheque.com{}", s)
}
});
.and_then(|re| re.captures(album_url))
.and_then(|c| c[1].parse::<i32>().ok())
.or_else(|| extract_volume_from_title(&title));
// Extract info fields
let album_text = album_el.text().collect::<String>();
let authors = extract_all_authors(&album_text);
let isbn = extract_info_value(&album_text, "EAN/ISBN")
.or_else(|| extract_info_value(&album_text, "ISBN"))
.map(|s| s.trim().to_string());
let page_count = extract_info_value(&album_text, "Planches")
.and_then(|s| s.trim().parse::<i32>().ok());
let publish_date = extract_info_value(&album_text, "Dépot légal")
.or_else(|| extract_info_value(&album_text, "Depot legal"))
.map(|s| s.trim().to_string());
// Authors from itemprop="author" and itemprop="illustrator"
let mut authors = Vec::new();
let author_sel = Selector::parse(r#"[itemprop="author"]"#).ok();
let illustrator_sel = Selector::parse(r#"[itemprop="illustrator"]"#).ok();
for sel in [&author_sel, &illustrator_sel].into_iter().flatten() {
for el in album_el.select(sel) {
let name = el.text().collect::<String>().trim().to_string();
// Names are "Last, First" format — normalize to "First Last"
let normalized = if let Some((last, first)) = name.split_once(',') {
format!("{} {}", first.trim(), last.trim())
} else {
name
};
if !normalized.is_empty() && is_real_author(&normalized) && !authors.contains(&normalized) {
authors.push(normalized);
}
}
}
// ISBN from <span itemprop="isbn">
let isbn = Selector::parse(r#"[itemprop="isbn"]"#)
.ok()
.and_then(|s| album_el.select(&s).next())
.map(|el| el.text().collect::<String>().trim().to_string())
.filter(|s| !s.is_empty());
// Page count from <span itemprop="numberOfPages">
let page_count = Selector::parse(r#"[itemprop="numberOfPages"]"#)
.ok()
.and_then(|s| album_el.select(&s).next())
.and_then(|el| el.text().collect::<String>().trim().parse::<i32>().ok());
// Publish date from <meta itemprop="datePublished" content="YYYY-MM-DD">
let publish_date = Selector::parse(r#"[itemprop="datePublished"]"#)
.ok()
.and_then(|s| album_el.select(&s).next())
.and_then(|el| el.value().attr("content").map(|c| c.trim().to_string()))
.filter(|s| !s.is_empty());
// Cover from pre-collected covers (same index)
let cover_url = covers.get(idx).cloned();
books.push(BookCandidate {
external_book_id,
@@ -517,26 +587,9 @@ async fn get_series_books_impl(
Ok(books)
}
fn select_text(doc: &Html, selector: &str) -> Option<String> {
Selector::parse(selector)
.ok()
.and_then(|s| doc.select(&s).next())
.map(|el| el.text().collect::<String>().trim().to_string())
.filter(|s| !s.is_empty())
}
fn extract_all_authors(text: &str) -> Vec<String> {
let mut authors = Vec::new();
for label in ["Scénario", "Scenario", "Dessin"] {
if let Some(val) = extract_info_value(text, label) {
for a in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
if !authors.contains(&a.to_string()) {
authors.push(a.to_string());
}
}
}
}
authors
/// Filter out placeholder author names from Bédéthèque
fn is_real_author(name: &str) -> bool {
!name.starts_with('<') && !name.ends_with('>') && name != "Collectif"
}
fn extract_volume_from_title(title: &str) -> Option<i32> {