From e581d140664070c1af9f47ff489c9b909a83823b Mon Sep 17 00:00:00 2001 From: nuxen Date: Thu, 13 Feb 2025 21:03:29 +0100 Subject: [PATCH] refactor(lists): title variation processing (#1965) refactor(lists): remove code duplication from title processing --- internal/list/title.go | 94 ++++++++++++++++++++----------------- internal/list/title_test.go | 8 ++++ 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/internal/list/title.go b/internal/list/title.go index c054c0a..08826c6 100644 --- a/internal/list/title.go +++ b/internal/list/title.go @@ -6,28 +6,55 @@ package list import ( "fmt" "regexp" + "slices" "strings" ) var ( /* replaceRegexp replaces various character classes/categories such as - \p{P} all Unicode punctuation category characters - \p{S} all Unicode symbol category characters - \p{Z) the Unicode seperator category characters + \p{P} Unicode punctuation category characters + \p{S} Unicode symbol category characters + \p{Z) Unicode seperator category characters \x{0080}-\x{017F} Unicode block "Latin-1 Supplement" and "Latin Extended-A" characters https://www.unicode.org/reports/tr44/#General_Category_Values https://www.regular-expressions.info/unicode.html#category https://www.compart.com/en/unicode/block/U+0080 https://www.compart.com/en/unicode/block/U+0100 */ - replaceRegexp = regexp.MustCompile(`[\p{P}\p{S}\p{Z}\x{0080}-\x{017F}]`) - questionmarkRegexp = regexp.MustCompile(`[?]{2,}`) - regionCodeRegexp = regexp.MustCompile(`\(\S+\)`) // also cleans titles from years like (YYYY)! + replaceRegexp = regexp.MustCompile(`[\p{P}\p{S}\p{Z}\x{0080}-\x{017F}]`) + questionmarkRegexp = regexp.MustCompile(`[?]{2,}`) + // cleans titles from years and region codes in parentheses, for example (2024) or (US) + parentheticalRegexp = regexp.MustCompile(`\(\S+\)`) parenthesesEndRegexp = regexp.MustCompile(`\)$`) + + apostropheReplacer = strings.NewReplacer("'", "", "´", "", "`", "", "‘", "", "’", "") ) -// yearRegexp = regexp.MustCompile(`\(\d{4}\)$`) +// generateVariations returns variations of the title with optionally removing apostrophes and info in parentheses. +func generateVariations(title string, removeApostrophes, removeParenthetical bool) []string { + var variation string + + if removeParenthetical { + variation = parentheticalRegexp.ReplaceAllString(title, "") + variation = strings.TrimRight(variation, " ") + } else { + variation = parenthesesEndRegexp.ReplaceAllString(title, "?") + } + + if removeApostrophes { + variation = apostropheReplacer.Replace(variation) + } + variation = replaceRegexp.ReplaceAllString(variation, "?") + variation = questionmarkRegexp.ReplaceAllString(variation, "*") + + return []string{ + variation, + strings.TrimRight(variation, "?* "), + } +} + +// yearRegexp = regexp.MustCompile(`\(\d{4}\)$`) func processTitle(title string, matchRelease bool) []string { // Checking if the title is empty. if strings.TrimSpace(title) == "" { @@ -38,46 +65,25 @@ func processTitle(title string, matchRelease bool) []string { // var re = regexp.MustCompile(`(?m)\s(\(\d+\))`) // title = re.ReplaceAllString(title, "") - t := NewTitleSlice() + t := NewTitleSet() if replaceRegexp.ReplaceAllString(title, "") == "" { t.Add(title, matchRelease) } else { - // title with all non-alphanumeric characters replaced by "?" - apostropheTitle := parenthesesEndRegexp.ReplaceAllString(title, "?") - apostropheTitle = replaceRegexp.ReplaceAllString(apostropheTitle, "?") - apostropheTitle = questionmarkRegexp.ReplaceAllString(apostropheTitle, "*") + titles := slices.Concat( + // don't remove apostrophes and info in parentheses + generateVariations(title, false, false), + // remove apostrophes but don't remove info in parentheses + generateVariations(title, true, false), + // don't remove apostrophes but remove info in parentheses + generateVariations(title, false, true), + // remove apostrophes and info in parentheses + generateVariations(title, true, true), + ) - t.Add(apostropheTitle, matchRelease) - t.Add(strings.TrimRight(apostropheTitle, "?* "), matchRelease) - - // title with apostrophes removed and all non-alphanumeric characters replaced by "?" - noApostropheTitle := parenthesesEndRegexp.ReplaceAllString(title, "?") - noApostropheTitle = strings.NewReplacer("'", "", "´", "", "`", "", "‘", "", "’", "").Replace(noApostropheTitle) - noApostropheTitle = replaceRegexp.ReplaceAllString(noApostropheTitle, "?") - noApostropheTitle = questionmarkRegexp.ReplaceAllString(noApostropheTitle, "*") - - t.Add(noApostropheTitle, matchRelease) - t.Add(strings.TrimRight(noApostropheTitle, "?* "), matchRelease) - - // title with regions in parentheses removed and all non-alphanumeric characters replaced by "?" - removedRegionCodeApostrophe := regionCodeRegexp.ReplaceAllString(title, "") - removedRegionCodeApostrophe = strings.TrimRight(removedRegionCodeApostrophe, " ") - removedRegionCodeApostrophe = replaceRegexp.ReplaceAllString(removedRegionCodeApostrophe, "?") - removedRegionCodeApostrophe = questionmarkRegexp.ReplaceAllString(removedRegionCodeApostrophe, "*") - - t.Add(removedRegionCodeApostrophe, matchRelease) - t.Add(strings.TrimRight(removedRegionCodeApostrophe, "?* "), matchRelease) - - // title with regions in parentheses and apostrophes removed and all non-alphanumeric characters replaced by "?" - removedRegionCodeNoApostrophe := regionCodeRegexp.ReplaceAllString(title, "") - removedRegionCodeNoApostrophe = strings.TrimRight(removedRegionCodeNoApostrophe, " ") - removedRegionCodeNoApostrophe = strings.NewReplacer("'", "", "´", "", "`", "", "‘", "", "’", "").Replace(removedRegionCodeNoApostrophe) - removedRegionCodeNoApostrophe = replaceRegexp.ReplaceAllString(removedRegionCodeNoApostrophe, "?") - removedRegionCodeNoApostrophe = questionmarkRegexp.ReplaceAllString(removedRegionCodeNoApostrophe, "*") - - t.Add(removedRegionCodeNoApostrophe, matchRelease) - t.Add(strings.TrimRight(removedRegionCodeNoApostrophe, "?* "), matchRelease) + for _, title := range titles { + t.Add(title, matchRelease) + } } return t.Titles() @@ -87,7 +93,7 @@ type Titles struct { tm map[string]struct{} } -func NewTitleSlice() *Titles { +func NewTitleSet() *Titles { ts := Titles{ tm: map[string]struct{}{}, } @@ -100,7 +106,7 @@ func (ts *Titles) Add(title string, matchRelease bool) { } if matchRelease { - title = strings.Trim(title, "?") + title = strings.Trim(title, "?* ") title = fmt.Sprintf("*%v*", title) } diff --git a/internal/list/title_test.go b/internal/list/title_test.go index 539d30f..ae1b7ef 100644 --- a/internal/list/title_test.go +++ b/internal/list/title_test.go @@ -277,6 +277,14 @@ func Test_processTitle(t *testing.T) { }, want: []string{"pok?mon"}, }, + { + name: "test_33", + args: args{ + title: "What If…?", + matchRelease: true, + }, + want: []string{"*What?If*"}, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) {