refactor(lists): title variation processing (#1965)

refactor(lists): remove code duplication from title processing
This commit is contained in:
nuxen 2025-02-13 21:03:29 +01:00 committed by GitHub
parent 06229edb55
commit e581d14066
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 58 additions and 44 deletions

View file

@ -6,15 +6,16 @@ package list
import ( import (
"fmt" "fmt"
"regexp" "regexp"
"slices"
"strings" "strings"
) )
var ( var (
/* /*
replaceRegexp replaces various character classes/categories such as replaceRegexp replaces various character classes/categories such as
\p{P} all Unicode punctuation category characters \p{P} Unicode punctuation category characters
\p{S} all Unicode symbol category characters \p{S} Unicode symbol category characters
\p{Z} the Unicode separator category characters \p{Z} Unicode separator category characters
\x{0080}-\x{017F} Unicode block "Latin-1 Supplement" and "Latin Extended-A" characters \x{0080}-\x{017F} Unicode block "Latin-1 Supplement" and "Latin Extended-A" characters
https://www.unicode.org/reports/tr44/#General_Category_Values https://www.unicode.org/reports/tr44/#General_Category_Values
https://www.regular-expressions.info/unicode.html#category https://www.regular-expressions.info/unicode.html#category
@ -23,10 +24,36 @@ var (
*/ */
replaceRegexp = regexp.MustCompile(`[\p{P}\p{S}\p{Z}\x{0080}-\x{017F}]`) replaceRegexp = regexp.MustCompile(`[\p{P}\p{S}\p{Z}\x{0080}-\x{017F}]`)
questionmarkRegexp = regexp.MustCompile(`[?]{2,}`) questionmarkRegexp = regexp.MustCompile(`[?]{2,}`)
regionCodeRegexp = regexp.MustCompile(`\(\S+\)`) // also cleans titles from years like (YYYY)! // cleans titles from years and region codes in parentheses, for example (2024) or (US)
parentheticalRegexp = regexp.MustCompile(`\(\S+\)`)
parenthesesEndRegexp = regexp.MustCompile(`\)$`) parenthesesEndRegexp = regexp.MustCompile(`\)$`)
apostropheReplacer = strings.NewReplacer("'", "", "´", "", "`", "", "", "", "", "")
) )
// generateVariations builds the wildcard patterns for a single title.
//
// When removeParenthetical is set, every whitespace-free "(...)" group is
// deleted and trailing spaces are stripped; otherwise only a closing
// parenthesis at the very end of the title is turned into "?". When
// removeApostrophes is set, the characters handled by apostropheReplacer are
// dropped. Afterwards every character matched by replaceRegexp becomes "?",
// and each run of two or more "?" collapses into a single "*".
//
// The result always has two entries: the pattern itself, followed by the same
// pattern with any trailing "?", "*" and space characters removed.
func generateVariations(title string, removeApostrophes, removeParenthetical bool) []string {
	pattern := title

	switch {
	case removeParenthetical:
		// Drop "(2024)" / "(US)" style groups and the spaces left behind at the end.
		pattern = strings.TrimRight(parentheticalRegexp.ReplaceAllString(pattern, ""), " ")
	default:
		pattern = parenthesesEndRegexp.ReplaceAllString(pattern, "?")
	}

	if removeApostrophes {
		pattern = apostropheReplacer.Replace(pattern)
	}

	// Wildcard single special characters first, then merge adjacent wildcards.
	pattern = questionmarkRegexp.ReplaceAllString(
		replaceRegexp.ReplaceAllString(pattern, "?"),
		"*",
	)

	trimmed := strings.TrimRight(pattern, "?* ")

	return []string{pattern, trimmed}
}
// yearRegexp = regexp.MustCompile(`\(\d{4}\)$`) // yearRegexp = regexp.MustCompile(`\(\d{4}\)$`)
func processTitle(title string, matchRelease bool) []string { func processTitle(title string, matchRelease bool) []string {
// Checking if the title is empty. // Checking if the title is empty.
@ -38,46 +65,25 @@ func processTitle(title string, matchRelease bool) []string {
// var re = regexp.MustCompile(`(?m)\s(\(\d+\))`) // var re = regexp.MustCompile(`(?m)\s(\(\d+\))`)
// title = re.ReplaceAllString(title, "") // title = re.ReplaceAllString(title, "")
t := NewTitleSlice() t := NewTitleSet()
if replaceRegexp.ReplaceAllString(title, "") == "" { if replaceRegexp.ReplaceAllString(title, "") == "" {
t.Add(title, matchRelease) t.Add(title, matchRelease)
} else { } else {
// title with all non-alphanumeric characters replaced by "?" titles := slices.Concat(
apostropheTitle := parenthesesEndRegexp.ReplaceAllString(title, "?") // don't remove apostrophes and info in parentheses
apostropheTitle = replaceRegexp.ReplaceAllString(apostropheTitle, "?") generateVariations(title, false, false),
apostropheTitle = questionmarkRegexp.ReplaceAllString(apostropheTitle, "*") // remove apostrophes but don't remove info in parentheses
generateVariations(title, true, false),
// don't remove apostrophes but remove info in parentheses
generateVariations(title, false, true),
// remove apostrophes and info in parentheses
generateVariations(title, true, true),
)
t.Add(apostropheTitle, matchRelease) for _, title := range titles {
t.Add(strings.TrimRight(apostropheTitle, "?* "), matchRelease) t.Add(title, matchRelease)
}
// title with apostrophes removed and all non-alphanumeric characters replaced by "?"
noApostropheTitle := parenthesesEndRegexp.ReplaceAllString(title, "?")
noApostropheTitle = strings.NewReplacer("'", "", "´", "", "`", "", "", "", "", "").Replace(noApostropheTitle)
noApostropheTitle = replaceRegexp.ReplaceAllString(noApostropheTitle, "?")
noApostropheTitle = questionmarkRegexp.ReplaceAllString(noApostropheTitle, "*")
t.Add(noApostropheTitle, matchRelease)
t.Add(strings.TrimRight(noApostropheTitle, "?* "), matchRelease)
// title with regions in parentheses removed and all non-alphanumeric characters replaced by "?"
removedRegionCodeApostrophe := regionCodeRegexp.ReplaceAllString(title, "")
removedRegionCodeApostrophe = strings.TrimRight(removedRegionCodeApostrophe, " ")
removedRegionCodeApostrophe = replaceRegexp.ReplaceAllString(removedRegionCodeApostrophe, "?")
removedRegionCodeApostrophe = questionmarkRegexp.ReplaceAllString(removedRegionCodeApostrophe, "*")
t.Add(removedRegionCodeApostrophe, matchRelease)
t.Add(strings.TrimRight(removedRegionCodeApostrophe, "?* "), matchRelease)
// title with regions in parentheses and apostrophes removed and all non-alphanumeric characters replaced by "?"
removedRegionCodeNoApostrophe := regionCodeRegexp.ReplaceAllString(title, "")
removedRegionCodeNoApostrophe = strings.TrimRight(removedRegionCodeNoApostrophe, " ")
removedRegionCodeNoApostrophe = strings.NewReplacer("'", "", "´", "", "`", "", "", "", "", "").Replace(removedRegionCodeNoApostrophe)
removedRegionCodeNoApostrophe = replaceRegexp.ReplaceAllString(removedRegionCodeNoApostrophe, "?")
removedRegionCodeNoApostrophe = questionmarkRegexp.ReplaceAllString(removedRegionCodeNoApostrophe, "*")
t.Add(removedRegionCodeNoApostrophe, matchRelease)
t.Add(strings.TrimRight(removedRegionCodeNoApostrophe, "?* "), matchRelease)
} }
return t.Titles() return t.Titles()
@ -87,7 +93,7 @@ type Titles struct {
tm map[string]struct{} tm map[string]struct{}
} }
func NewTitleSlice() *Titles { func NewTitleSet() *Titles {
ts := Titles{ ts := Titles{
tm: map[string]struct{}{}, tm: map[string]struct{}{},
} }
@ -100,7 +106,7 @@ func (ts *Titles) Add(title string, matchRelease bool) {
} }
if matchRelease { if matchRelease {
title = strings.Trim(title, "?") title = strings.Trim(title, "?* ")
title = fmt.Sprintf("*%v*", title) title = fmt.Sprintf("*%v*", title)
} }

View file

@ -277,6 +277,14 @@ func Test_processTitle(t *testing.T) {
}, },
want: []string{"pok?mon"}, want: []string{"pok?mon"},
}, },
{
name: "test_33",
args: args{
title: "What If…?",
matchRelease: true,
},
want: []string{"*What?If*"},
},
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {