refactor(lists): title variation processing (#1965)

refactor(lists): remove code duplication from title processing
This commit is contained in:
nuxen 2025-02-13 21:03:29 +01:00 committed by GitHub
parent 06229edb55
commit e581d14066
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 58 additions and 44 deletions

View file

@ -6,28 +6,55 @@ package list
import (
"fmt"
"regexp"
"slices"
"strings"
)
var (
/*
replaceRegexp replaces various character classes/categories such as
\p{P} all Unicode punctuation category characters
\p{S} all Unicode symbol category characters
\p{Z) the Unicode seperator category characters
\p{P} Unicode punctuation category characters
\p{S} Unicode symbol category characters
\p{Z) Unicode seperator category characters
\x{0080}-\x{017F} Unicode block "Latin-1 Supplement" and "Latin Extended-A" characters
https://www.unicode.org/reports/tr44/#General_Category_Values
https://www.regular-expressions.info/unicode.html#category
https://www.compart.com/en/unicode/block/U+0080
https://www.compart.com/en/unicode/block/U+0100
*/
replaceRegexp = regexp.MustCompile(`[\p{P}\p{S}\p{Z}\x{0080}-\x{017F}]`)
questionmarkRegexp = regexp.MustCompile(`[?]{2,}`)
regionCodeRegexp = regexp.MustCompile(`\(\S+\)`) // also cleans titles from years like (YYYY)!
replaceRegexp = regexp.MustCompile(`[\p{P}\p{S}\p{Z}\x{0080}-\x{017F}]`)
questionmarkRegexp = regexp.MustCompile(`[?]{2,}`)
// cleans titles from years and region codes in parentheses, for example (2024) or (US)
parentheticalRegexp = regexp.MustCompile(`\(\S+\)`)
parenthesesEndRegexp = regexp.MustCompile(`\)$`)
apostropheReplacer = strings.NewReplacer("'", "", "´", "", "`", "", "", "", "", "")
)
// yearRegexp = regexp.MustCompile(`\(\d{4}\)$`)
// generateVariations returns variations of the title with optionally removing apostrophes and info in parentheses.
func generateVariations(title string, removeApostrophes, removeParenthetical bool) []string {
var variation string
if removeParenthetical {
variation = parentheticalRegexp.ReplaceAllString(title, "")
variation = strings.TrimRight(variation, " ")
} else {
variation = parenthesesEndRegexp.ReplaceAllString(title, "?")
}
if removeApostrophes {
variation = apostropheReplacer.Replace(variation)
}
variation = replaceRegexp.ReplaceAllString(variation, "?")
variation = questionmarkRegexp.ReplaceAllString(variation, "*")
return []string{
variation,
strings.TrimRight(variation, "?* "),
}
}
// yearRegexp = regexp.MustCompile(`\(\d{4}\)$`)
func processTitle(title string, matchRelease bool) []string {
// Checking if the title is empty.
if strings.TrimSpace(title) == "" {
@ -38,46 +65,25 @@ func processTitle(title string, matchRelease bool) []string {
// var re = regexp.MustCompile(`(?m)\s(\(\d+\))`)
// title = re.ReplaceAllString(title, "")
t := NewTitleSlice()
t := NewTitleSet()
if replaceRegexp.ReplaceAllString(title, "") == "" {
t.Add(title, matchRelease)
} else {
// title with all non-alphanumeric characters replaced by "?"
apostropheTitle := parenthesesEndRegexp.ReplaceAllString(title, "?")
apostropheTitle = replaceRegexp.ReplaceAllString(apostropheTitle, "?")
apostropheTitle = questionmarkRegexp.ReplaceAllString(apostropheTitle, "*")
titles := slices.Concat(
// don't remove apostrophes and info in parentheses
generateVariations(title, false, false),
// remove apostrophes but don't remove info in parentheses
generateVariations(title, true, false),
// don't remove apostrophes but remove info in parentheses
generateVariations(title, false, true),
// remove apostrophes and info in parentheses
generateVariations(title, true, true),
)
t.Add(apostropheTitle, matchRelease)
t.Add(strings.TrimRight(apostropheTitle, "?* "), matchRelease)
// title with apostrophes removed and all non-alphanumeric characters replaced by "?"
noApostropheTitle := parenthesesEndRegexp.ReplaceAllString(title, "?")
noApostropheTitle = strings.NewReplacer("'", "", "´", "", "`", "", "", "", "", "").Replace(noApostropheTitle)
noApostropheTitle = replaceRegexp.ReplaceAllString(noApostropheTitle, "?")
noApostropheTitle = questionmarkRegexp.ReplaceAllString(noApostropheTitle, "*")
t.Add(noApostropheTitle, matchRelease)
t.Add(strings.TrimRight(noApostropheTitle, "?* "), matchRelease)
// title with regions in parentheses removed and all non-alphanumeric characters replaced by "?"
removedRegionCodeApostrophe := regionCodeRegexp.ReplaceAllString(title, "")
removedRegionCodeApostrophe = strings.TrimRight(removedRegionCodeApostrophe, " ")
removedRegionCodeApostrophe = replaceRegexp.ReplaceAllString(removedRegionCodeApostrophe, "?")
removedRegionCodeApostrophe = questionmarkRegexp.ReplaceAllString(removedRegionCodeApostrophe, "*")
t.Add(removedRegionCodeApostrophe, matchRelease)
t.Add(strings.TrimRight(removedRegionCodeApostrophe, "?* "), matchRelease)
// title with regions in parentheses and apostrophes removed and all non-alphanumeric characters replaced by "?"
removedRegionCodeNoApostrophe := regionCodeRegexp.ReplaceAllString(title, "")
removedRegionCodeNoApostrophe = strings.TrimRight(removedRegionCodeNoApostrophe, " ")
removedRegionCodeNoApostrophe = strings.NewReplacer("'", "", "´", "", "`", "", "", "", "", "").Replace(removedRegionCodeNoApostrophe)
removedRegionCodeNoApostrophe = replaceRegexp.ReplaceAllString(removedRegionCodeNoApostrophe, "?")
removedRegionCodeNoApostrophe = questionmarkRegexp.ReplaceAllString(removedRegionCodeNoApostrophe, "*")
t.Add(removedRegionCodeNoApostrophe, matchRelease)
t.Add(strings.TrimRight(removedRegionCodeNoApostrophe, "?* "), matchRelease)
for _, title := range titles {
t.Add(title, matchRelease)
}
}
return t.Titles()
@ -87,7 +93,7 @@ type Titles struct {
tm map[string]struct{}
}
func NewTitleSlice() *Titles {
func NewTitleSet() *Titles {
ts := Titles{
tm: map[string]struct{}{},
}
@ -100,7 +106,7 @@ func (ts *Titles) Add(title string, matchRelease bool) {
}
if matchRelease {
title = strings.Trim(title, "?")
title = strings.Trim(title, "?* ")
title = fmt.Sprintf("*%v*", title)
}

View file

@ -277,6 +277,14 @@ func Test_processTitle(t *testing.T) {
},
want: []string{"pok?mon"},
},
{
name: "test_33",
args: args{
title: "What If…?",
matchRelease: true,
},
want: []string{"*What?If*"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {