fix: normalize French articles in Bedetheque confidence scoring

Bedetheque lists titles as "Légendaires (Les) - Résistance", while local series names use "Les légendaires - Résistance". Add normalize_title(), which strips leading articles and articles in parentheses before comparing, so that both forms produce a 100% confidence match.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 22:04:58 +01:00
parent b226aa3a35
commit 1e4d9acebe
1 changed file with 36 additions and 6 deletions
--- a/apps/api/src/metadata_providers/bedetheque.rs
+++ b/apps/api/src/metadata_providers/bedetheque.rs
@@ -610,20 +610,50 @@ fn extract_volume_from_title(title: &str) -> Option<i32> {
    None
 }

+/// Normalize a title by lowercasing it, removing French articles (leading or
+/// in parentheses) and collapsing extra whitespace, so that "Les Légendaires - Résistance"
+/// and "Légendaires (Les) - Résistance" produce the same canonical form.
+fn normalize_title(s: &str) -> String {
+    let lower = s.to_lowercase();
+    // Remove articles in parentheses: "(les)", "(la)", "(le)", "(l')", "(un)", "(une)", "(des)", "(du)", "(d')"
+    let re_parens = regex::Regex::new(r"\s*\((?:les?|la|l'|une?|des|du|d')\)").unwrap();
+    let cleaned = re_parens.replace_all(&lower, "");
+    // Remove a leading article followed by whitespace ("les ", "la ", "le ", "un ", "une ", "des ", "du "); "l'" and "d'" are only stripped when followed by a space
+    let re_leading = regex::Regex::new(r"^(?:les?|la|l'|une?|des|du|d')\s+").unwrap();
+    let cleaned = re_leading.replace(&cleaned, "");
+    // Collapse runs of whitespace into a single space (dashes are left untouched)
+    let re_spaces = regex::Regex::new(r"\s+").unwrap();
+    re_spaces.replace_all(cleaned.trim(), " ").to_string()
+}
+
 fn compute_confidence(title: &str, query: &str) -> f32 {
    let title_lower = title.to_lowercase();
-    if title_lower == query {
-        1.0
-    } else if title_lower.starts_with(query) || query.starts_with(&title_lower) {
+    let query_lower = query.to_lowercase();
+    if title_lower == query_lower {
+        return 1.0;
+    }
+
+    // Try with normalized forms (handles Bedetheque's "Name (Article)" convention)
+    let title_norm = normalize_title(title);
+    let query_norm = normalize_title(query);
+    if title_norm == query_norm {
+        return 1.0;
+    }
+
+    if title_lower.starts_with(&query_lower) || query_lower.starts_with(&title_lower) {
        0.85
-    } else if title_lower.contains(query) || query.contains(&title_lower) {
+    } else if title_norm.starts_with(&query_norm) || query_norm.starts_with(&title_norm) {
+        0.85
+    } else if title_lower.contains(&query_lower) || query_lower.contains(&title_lower) {
+        0.7
+    } else if title_norm.contains(&query_norm) || query_norm.contains(&title_norm) {
        0.7
    } else {
-        let common: usize = query
+        let common: usize = query_lower
            .chars()
            .filter(|c| title_lower.contains(*c))
            .count();
-        let max_len = query.len().max(title_lower.len()).max(1);
+        let max_len = query_lower.len().max(title_lower.len()).max(1);
        (common as f32 / max_len as f32).clamp(0.1, 0.6)
    }
 }