diff --git a/internal/list/process_list_anilist.go b/internal/list/process_list_anilist.go index 7e5c0a8..39e344f 100644 --- a/internal/list/process_list_anilist.go +++ b/internal/list/process_list_anilist.go @@ -7,7 +7,6 @@ import ( "context" "encoding/json" "net/http" - "regexp" "sort" "strings" @@ -16,13 +15,6 @@ import ( "github.com/pkg/errors" ) -var ( - // including math and curreny symbols: $¤<~♡+=^ etc - symbolsRegexp = regexp.MustCompile(`\p{S}`) - latin1SupplementRegexp = regexp.MustCompile(`[\x{0080}-\x{00FF}]`) // Unicode Block “Latin-1 Supplement” - latinExtendedARegexp = regexp.MustCompile(`[\x{0100}-\x{017F}]`) -) - func (s *service) anilist(ctx context.Context, list *domain.List) error { l := s.log.With().Str("type", "anilist").Str("list", list.Name).Logger() @@ -70,11 +62,7 @@ func (s *service) anilist(ctx context.Context, list *domain.List) error { } for title := range titlesToProcess { - // replace unicode symbols, Unicode Block “Latin-1 Supplement” and Unicode Block “Latin Extended-A” chars by "?" 
- clearedTitle := symbolsRegexp.ReplaceAllString(title, "?") - clearedTitle = latin1SupplementRegexp.ReplaceAllString(clearedTitle, "?") - clearedTitle = latinExtendedARegexp.ReplaceAllString(clearedTitle, "?") - for _, processedTitle := range processTitle(clearedTitle, list.MatchRelease) { + for _, processedTitle := range processTitle(title, list.MatchRelease) { titleSet[processedTitle] = struct{}{} } } diff --git a/internal/list/title.go b/internal/list/title.go index 2662366..c054c0a 100644 --- a/internal/list/title.go +++ b/internal/list/title.go @@ -9,13 +9,21 @@ import ( "strings" ) -// Regex patterns -// https://www.regular-expressions.info/unicode.html#category -// https://www.ncbi.nlm.nih.gov/staff/beck/charents/hex.html var ( - replaceRegexp = regexp.MustCompile(`[\p{P}\p{Z}\x{00C0}-\x{017E}\x{00AE}]`) + /* + replaceRegexp replaces various character classes/categories such as + \p{P} all Unicode punctuation category characters + \p{S} all Unicode symbol category characters + \p{Z} the Unicode separator category characters + \x{0080}-\x{017F} Unicode block "Latin-1 Supplement" and "Latin Extended-A" characters + https://www.unicode.org/reports/tr44/#General_Category_Values + https://www.regular-expressions.info/unicode.html#category + https://www.compart.com/en/unicode/block/U+0080 + https://www.compart.com/en/unicode/block/U+0100 + */ + replaceRegexp = regexp.MustCompile(`[\p{P}\p{S}\p{Z}\x{0080}-\x{017F}]`) questionmarkRegexp = regexp.MustCompile(`[?]{2,}`) - regionCodeRegexp = regexp.MustCompile(`\(\S+\)`) + regionCodeRegexp = regexp.MustCompile(`\(\S+\)`) // also cleans titles from years like (YYYY)! 
parenthesesEndRegexp = regexp.MustCompile(`\)$`) ) diff --git a/internal/list/title_test.go b/internal/list/title_test.go index a22961b..539d30f 100644 --- a/internal/list/title_test.go +++ b/internal/list/title_test.go @@ -267,7 +267,7 @@ func Test_processTitle(t *testing.T) { title: "solo leveling 2ª temporada -ergam-se das sombras-", matchRelease: false, }, - want: []string{"solo?leveling?2ª?temporada*ergam?se?das?sombras", "solo?leveling?2ª?temporada*ergam?se?das?sombras?"}, + want: []string{"solo?leveling?2*temporada*ergam?se?das?sombras", "solo?leveling?2*temporada*ergam?se?das?sombras?"}, }, { name: "test_32",