feat: add series status, improve providers & e2e tests

- Add series status concept (ongoing/ended/hiatus/cancelled/upcoming) with normalization across all providers
- Add status field to series_metadata table (migration 0033)
- AniList: use chapters as fallback for volume count on ongoing series, add books_message when both volumes and chapters are null
- Bedetheque: extract description from meta tag, genres, parution status, origin/language; rewrite book parsing with itemprop microdata for clean ISBN, dates, page counts, covers; filter placeholder authors
- Add comprehensive e2e provider tests with field coverage reporting
- Wire status into EditSeriesForm, MetadataSearchModal, and series page

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 16:10:45 +01:00
parent 51ef2fa725
commit 52b9b0e00e
10 changed files with 566 additions and 156 deletions
--- a/apps/api/src/metadata_providers/bedetheque.rs
+++ b/apps/api/src/metadata_providers/bedetheque.rs
@@ -210,6 +210,10 @@ async fn search_series_impl(
                    "authors": c.authors,
                    "publishers": c.publishers,
                    "start_year": c.start_year,
+                    "genres": details.genres,
+                    "status": details.status,
+                    "origin": details.origin,
+                    "language": details.language,
                });
            }
        }
@@ -235,6 +239,10 @@ struct SeriesDetails {
    publishers: Vec<String>,
    start_year: Option<i32>,
    album_count: Option<i32>,
+    genres: Vec<String>,
+    status: Option<String>,
+    origin: Option<String>,
+    language: Option<String>,
 }

 async fn fetch_series_details(
@@ -276,64 +284,109 @@ async fn fetch_series_details(
        publishers: vec![],
        start_year: None,
        album_count: None,
+        genres: vec![],
+        status: None,
+        origin: None,
+        language: None,
    };

-    // Description: look for #full-commentaire or .serie-info
-    if let Ok(sel) = Selector::parse("#full-commentaire") {
+    // Description from <meta name="description"> — format: "Tout sur la série {name} : {description}"
+    if let Ok(sel) = Selector::parse(r#"meta[name="description"]"#) {
        if let Some(el) = doc.select(&sel).next() {
-            let text = el.text().collect::<String>().trim().to_string();
-            if !text.is_empty() {
-                details.description = Some(text);
-            }
-        }
-    }
-
-    // Fallback description from span.infoedition
-    if details.description.is_none() {
-        if let Ok(sel) = Selector::parse("span.infoedition") {
-            if let Some(el) = doc.select(&sel).next() {
-                let text = el.text().collect::<String>().trim().to_string();
-                if !text.is_empty() {
-                    details.description = Some(text);
+            if let Some(content) = el.value().attr("content") {
+                let desc = content.trim().to_string();
+                // Strip the "Tout sur la série ... : " prefix
+                let cleaned = if let Some(pos) = desc.find(" : ") {
+                    desc[pos + 3..].trim().to_string()
+                } else {
+                    desc
+                };
+                if !cleaned.is_empty() {
+                    details.description = Some(cleaned);
                }
            }
        }
    }

-    // Extract authors and publishers from album info blocks
-    if let Ok(sel) = Selector::parse(".infos li") {
+    // Extract authors from itemprop="author" and itemprop="illustrator" (deduplicated)
+    {
        let mut authors_set = std::collections::HashSet::new();
-        let mut publishers_set = std::collections::HashSet::new();
-
-        for li in doc.select(&sel) {
-            let text = li.text().collect::<String>();
-            let text = text.trim();
-
-            if let Some(val) = extract_info_value(text, "Scénario") {
-                for a in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
-                    authors_set.insert(a.to_string());
-                }
-            }
-            if let Some(val) = extract_info_value(text, "Dessin") {
-                for a in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
-                    authors_set.insert(a.to_string());
-                }
-            }
-            if let Some(val) = extract_info_value(text, "Editeur") {
-                for p in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
-                    publishers_set.insert(p.to_string());
+        for attr in ["author", "illustrator"] {
+            if let Ok(sel) = Selector::parse(&format!(r#"[itemprop="{attr}"]"#)) {
+                for el in doc.select(&sel) {
+                    let name = el.text().collect::<String>().trim().to_string();
+                    // Names are "Last, First" — normalize to "First Last"
+                    let normalized = if let Some((last, first)) = name.split_once(',') {
+                        format!("{} {}", first.trim(), last.trim())
+                    } else {
+                        name
+                    };
+                    if !normalized.is_empty() && is_real_author(&normalized) {
+                        authors_set.insert(normalized);
+                    }
                }
            }
        }
-
        details.authors = authors_set.into_iter().collect();
        details.authors.sort();
+    }
+
+    // Extract publishers from itemprop="publisher" (deduplicated)
+    {
+        let mut publishers_set = std::collections::HashSet::new();
+        if let Ok(sel) = Selector::parse(r#"[itemprop="publisher"]"#) {
+            for el in doc.select(&sel) {
+                let name = el.text().collect::<String>().trim().to_string();
+                if !name.is_empty() {
+                    publishers_set.insert(name);
+                }
+            }
+        }
        details.publishers = publishers_set.into_iter().collect();
        details.publishers.sort();
    }

-    // Album count from serie-info text (e.g. "Tomes : 8")
+    // Extract series-level info from <li><label>X :</label>value</li> blocks
+    // Genre: <li><label>Genre :</label><span class="style-serie">Animalier, Aventure, Humour</span></li>
+    if let Ok(sel) = Selector::parse("span.style-serie") {
+        if let Some(el) = doc.select(&sel).next() {
+            let text = el.text().collect::<String>();
+            details.genres = text
+                .split(',')
+                .map(|s| s.trim().to_string())
+                .filter(|s| !s.is_empty())
+                .collect();
+        }
+    }
+
+    // Parution: <li><label>Parution :</label><span class="parution-serie">Série finie</span></li>
+    if let Ok(sel) = Selector::parse("span.parution-serie") {
+        if let Some(el) = doc.select(&sel).next() {
+            let text = el.text().collect::<String>().trim().to_string();
+            if !text.is_empty() {
+                details.status = Some(text);
+            }
+        }
+    }
+
+    // Origine and Langue from page text (no dedicated CSS class)
    let page_text = doc.root_element().text().collect::<String>();
+
+    if let Some(val) = extract_info_value(&page_text, "Origine") {
+        let val = val.lines().next().unwrap_or(val).trim();
+        if !val.is_empty() {
+            details.origin = Some(val.to_string());
+        }
+    }
+
+    if let Some(val) = extract_info_value(&page_text, "Langue") {
+        let val = val.lines().next().unwrap_or(val).trim();
+        if !val.is_empty() {
+            details.language = Some(val.to_string());
+        }
+    }
+
+    // Album count from serie-info text (e.g. "Tomes : 8")
    if let Ok(re) = regex::Regex::new(r"Tomes?\s*:\s*(\d+)") {
        if let Some(caps) = re.captures(&page_text) {
            if let Ok(n) = caps[1].parse::<i32>() {
@@ -342,11 +395,16 @@ async fn fetch_series_details(
        }
    }

-    // Start year from first album date (Dépot légal)
-    if let Ok(re) = regex::Regex::new(r"[Dd][ée]p[ôo]t l[ée]gal\s*:\s*\d{2}/(\d{4})") {
-        if let Some(caps) = re.captures(&page_text) {
-            if let Ok(year) = caps[1].parse::<i32>() {
-                details.start_year = Some(year);
+    // Start year from first <meta itemprop="datePublished" content="YYYY-MM-DD">
+    if let Ok(sel) = Selector::parse(r#"[itemprop="datePublished"]"#) {
+        if let Some(el) = doc.select(&sel).next() {
+            if let Some(content) = el.value().attr("content") {
+                // content is "YYYY-MM-DD"
+                if let Some(year_str) = content.split('-').next() {
+                    if let Ok(year) = year_str.parse::<i32>() {
+                        details.start_year = Some(year);
+                    }
+                }
            }
        }
    }
@@ -424,79 +482,91 @@ async fn get_series_books_impl(
    let doc = Html::parse_document(&html);
    let mut books = Vec::new();

-    // Albums are in .album-main blocks
+    // Album data lives in `.album-main` blocks, but each cover image
+    // (<img itemprop="image">) sits OUTSIDE its `.album-main` (as a sibling).
+    // So: read the itemprop fields relative to each `.album-main`, and collect the
+    // covers separately over the whole document, pairing them with albums by index.
    let album_sel = Selector::parse(".album-main").map_err(|e| format!("selector: {e}"))?;

-    for album_el in doc.select(&album_sel) {
-        let album_html = album_el.html();
-        let album_doc = Html::parse_fragment(&album_html);
+    // Pre-collect cover images — they appear in <img itemprop="image"> before each .album-main
+    // and link to an album URL containing the book ID
+    let cover_sel = Selector::parse(r#"img[itemprop="image"]"#).map_err(|e| format!("selector: {e}"))?;
+    let covers: Vec<String> = doc.select(&cover_sel)
+        .filter_map(|el| el.value().attr("src").map(|s| {
+            if s.starts_with("http") { s.to_string() } else { format!("https://www.bedetheque.com{}", s) }
+        }))
+        .collect();

-        // Title from .titre
-        let title = select_text(&album_doc, ".titre")
-            .or_else(|| {
-                Selector::parse(".titre a")
-                    .ok()
-                    .and_then(|s| album_doc.select(&s).next())
-                    .map(|el| el.text().collect::<String>().trim().to_string())
-            })
-            .unwrap_or_default();
+    for (idx, album_el) in doc.select(&album_sel).enumerate() {
+        // Title from <a class="titre" title="..."> — the title attribute is clean
+        let title_sel = Selector::parse("a.titre").ok();
+        let title_el = title_sel.as_ref().and_then(|s| album_el.select(s).next());
+        let title = title_el
+            .and_then(|el| el.value().attr("title"))
+            .unwrap_or("")
+            .trim()
+            .to_string();

        if title.is_empty() {
            continue;
        }

-        // Volume number from title or .num span
-        let volume_number = select_text(&album_doc, ".num")
-            .and_then(|s| {
-                s.trim_end_matches('.')
-                    .trim()
-                    .parse::<i32>()
-                    .ok()
-            })
-            .or_else(|| extract_volume_from_title(&title));
-
-        // Album URL
-        let album_url = Selector::parse("a[href*='/BD-']")
+        // External book ID from album URL (e.g. "...-1063.html")
+        let album_url = title_el.and_then(|el| el.value().attr("href")).unwrap_or("");
+        let external_book_id = regex::Regex::new(r"-(\d+)\.html")
            .ok()
-            .and_then(|s| album_doc.select(&s).next())
-            .and_then(|el| el.value().attr("href"))
-            .map(String::from);
-
-        // External book id from URL
-        let external_book_id = album_url
-            .as_deref()
-            .and_then(|u| {
-                regex::Regex::new(r"-(\d+)\.html")
-                    .ok()
-                    .and_then(|re| re.captures(u))
-                    .map(|c| c[1].to_string())
-            })
+            .and_then(|re| re.captures(album_url))
+            .map(|c| c[1].to_string())
            .unwrap_or_default();

-        // Cover
-        let cover_url = Selector::parse("img[src*='cache/thb_couv']")
+        // Volume number from URL pattern "Tome-{N}-" or from itemprop name
+        let volume_number = regex::Regex::new(r"(?i)Tome-(\d+)-")
            .ok()
-            .and_then(|s| album_doc.select(&s).next())
-            .and_then(|el| el.value().attr("src"))
-            .map(|s| {
-                if s.starts_with("http") {
-                    s.to_string()
-                } else {
-                    format!("https://www.bedetheque.com{}", s)
-                }
-            });
+            .and_then(|re| re.captures(album_url))
+            .and_then(|c| c[1].parse::<i32>().ok())
+            .or_else(|| extract_volume_from_title(&title));

-        // Extract info fields
-        let album_text = album_el.text().collect::<String>();
-        let authors = extract_all_authors(&album_text);
-        let isbn = extract_info_value(&album_text, "EAN/ISBN")
-            .or_else(|| extract_info_value(&album_text, "ISBN"))
-            .map(|s| s.trim().to_string());
-        let page_count = extract_info_value(&album_text, "Planches")
-            .and_then(|s| s.trim().parse::<i32>().ok());
-        let publish_date = extract_info_value(&album_text, "Dépot légal")
-            .or_else(|| extract_info_value(&album_text, "Depot legal"))
-            .map(|s| s.trim().to_string());
+        // Authors from itemprop="author" and itemprop="illustrator"
+        let mut authors = Vec::new();
+        let author_sel = Selector::parse(r#"[itemprop="author"]"#).ok();
+        let illustrator_sel = Selector::parse(r#"[itemprop="illustrator"]"#).ok();
+        for sel in [&author_sel, &illustrator_sel].into_iter().flatten() {
+            for el in album_el.select(sel) {
+                let name = el.text().collect::<String>().trim().to_string();
+                // Names are "Last, First" format — normalize to "First Last"
+                let normalized = if let Some((last, first)) = name.split_once(',') {
+                    format!("{} {}", first.trim(), last.trim())
+                } else {
+                    name
+                };
+                if !normalized.is_empty() && is_real_author(&normalized) && !authors.contains(&normalized) {
+                    authors.push(normalized);
+                }
+            }
+        }
+
+        // ISBN from <span itemprop="isbn">
+        let isbn = Selector::parse(r#"[itemprop="isbn"]"#)
+            .ok()
+            .and_then(|s| album_el.select(&s).next())
+            .map(|el| el.text().collect::<String>().trim().to_string())
+            .filter(|s| !s.is_empty());
+
+        // Page count from <span itemprop="numberOfPages">
+        let page_count = Selector::parse(r#"[itemprop="numberOfPages"]"#)
+            .ok()
+            .and_then(|s| album_el.select(&s).next())
+            .and_then(|el| el.text().collect::<String>().trim().parse::<i32>().ok());
+
+        // Publish date from <meta itemprop="datePublished" content="YYYY-MM-DD">
+        let publish_date = Selector::parse(r#"[itemprop="datePublished"]"#)
+            .ok()
+            .and_then(|s| album_el.select(&s).next())
+            .and_then(|el| el.value().attr("content").map(|c| c.trim().to_string()))
+            .filter(|s| !s.is_empty());
+
+        // Cover from pre-collected covers (same index)
+        let cover_url = covers.get(idx).cloned();

        books.push(BookCandidate {
            external_book_id,
@@ -517,26 +587,9 @@ async fn get_series_books_impl(
    Ok(books)
 }

-fn select_text(doc: &Html, selector: &str) -> Option<String> {
-    Selector::parse(selector)
-        .ok()
-        .and_then(|s| doc.select(&s).next())
-        .map(|el| el.text().collect::<String>().trim().to_string())
-        .filter(|s| !s.is_empty())
-}
-
-fn extract_all_authors(text: &str) -> Vec<String> {
-    let mut authors = Vec::new();
-    for label in ["Scénario", "Scenario", "Dessin"] {
-        if let Some(val) = extract_info_value(text, label) {
-            for a in val.split(',').map(str::trim).filter(|s| !s.is_empty()) {
-                if !authors.contains(&a.to_string()) {
-                    authors.push(a.to_string());
-                }
-            }
-        }
-    }
-    authors
+/// Filter out placeholder author names from Bédéthèque
+fn is_real_author(name: &str) -> bool {
+    !name.starts_with('<') && !name.ends_with('>') && name != "Collectif"
 }

 fn extract_volume_from_title(title: &str) -> Option<i32> {