fix: normalize French articles in Bedetheque confidence scoring

Bedetheque uses "Légendaires (Les) - Résistance" while local series
names are "Les légendaires - Résistance". Add normalize_title() that
strips leading articles and articles in parentheses before comparing,
so these forms correctly produce a 100% confidence match.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-18 22:04:58 +01:00
parent b226aa3a35
commit 1e4d9acebe

View File

@@ -610,20 +610,50 @@ fn extract_volume_from_title(title: &str) -> Option<i32> {
None
}
/// Normalize a title so that the different spellings of a French series name
/// compare equal: "Les Légendaires - Résistance" and
/// "Légendaires (Les) - Résistance" both become "légendaires - résistance",
/// and "L'Incal" and "Incal (L')" both become "incal".
///
/// The title is lowercased, the typographic apostrophe (U+2019) is folded to
/// `'`, parenthesized articles and a single leading article are removed, and
/// runs of whitespace are collapsed to one space. Dashes and other
/// punctuation are left untouched.
///
/// Returns the canonical form as a new `String`; an empty or all-whitespace
/// input yields an empty string.
fn normalize_title(s: &str) -> String {
    // Fold the typographic apostrophe so "l’incal" and "l'incal" are treated alike.
    let lower = s.to_lowercase().replace('\u{2019}', "'");
    let without_parens = strip_parenthesized_articles(&lower);
    let without_leading = strip_leading_article(&without_parens);
    without_leading
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
}

/// Returns true when `word` (already lowercased) is exactly a French article.
fn is_french_article(word: &str) -> bool {
    matches!(
        word,
        "le" | "les" | "la" | "l'" | "un" | "une" | "des" | "du" | "d'"
    )
}

/// Removes every "(article)" group together with the whitespace before it.
/// Parentheses holding anything other than a bare article are kept verbatim.
fn strip_parenthesized_articles(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(open) = rest.find('(') {
        let after_open = &rest[open + 1..];
        match after_open.find(')') {
            Some(close) if is_french_article(&after_open[..close]) => {
                // Drop the group and the spacing that separated it from the name.
                out.push_str(rest[..open].trim_end());
                rest = &after_open[close + 1..];
            }
            _ => {
                // Not an article group: keep the '(' and rescan right after it,
                // so an inner "(les)" in "((les))" is still found.
                out.push_str(&rest[..=open]);
                rest = after_open;
            }
        }
    }
    out.push_str(rest);
    out
}

/// Strips one leading article from an already-lowercased title.
/// Elided articles ("l'", "d'") need no following space; full-word articles
/// must be followed by whitespace so that words such as "lesbos" are kept.
/// A title consisting only of an article is returned unchanged (trimmed).
fn strip_leading_article(s: &str) -> &str {
    // Trim first: otherwise leading whitespace would hide the article.
    let s = s.trim();
    let elided = s.strip_prefix("l'").or_else(|| s.strip_prefix("d'"));
    let spaced = s
        .split_once(char::is_whitespace)
        .filter(|(word, _)| matches!(*word, "le" | "les" | "la" | "un" | "une" | "des" | "du"))
        .map(|(_, rest)| rest);
    elided
        .or(spaced)
        .map(str::trim_start)
        .filter(|rest| !rest.is_empty())
        .unwrap_or(s)
}
fn compute_confidence(title: &str, query: &str) -> f32 {
let title_lower = title.to_lowercase();
if title_lower == query {
1.0
} else if title_lower.starts_with(query) || query.starts_with(&title_lower) {
let query_lower = query.to_lowercase();
if title_lower == query_lower {
return 1.0;
}
// Try with normalized forms (handles Bedetheque's "Name (Article)" convention)
let title_norm = normalize_title(title);
let query_norm = normalize_title(query);
if title_norm == query_norm {
return 1.0;
}
if title_lower.starts_with(&query_lower) || query_lower.starts_with(&title_lower) {
0.85
} else if title_lower.contains(query) || query.contains(&title_lower) {
} else if title_norm.starts_with(&query_norm) || query_norm.starts_with(&title_norm) {
0.85
} else if title_lower.contains(&query_lower) || query_lower.contains(&title_lower) {
0.7
} else if title_norm.contains(&query_norm) || query_norm.contains(&title_norm) {
0.7
} else {
let common: usize = query
let common: usize = query_lower
.chars()
.filter(|c| title_lower.contains(*c))
.count();
let max_len = query.len().max(title_lower.len()).max(1);
let max_len = query_lower.len().max(title_lower.len()).max(1);
(common as f32 / max_len as f32).clamp(0.1, 0.6)
}
}