refactor(lists): title character filtering (#1955)

This commit is contained in:
martylukyy 2025-02-08 14:16:54 +01:00 committed by GitHub
parent 4fbaa0b72c
commit 6e77f0339b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 15 additions and 19 deletions

View file

@ -7,7 +7,6 @@ import (
"context"
"encoding/json"
"net/http"
"regexp"
"sort"
"strings"
@ -16,13 +15,6 @@ import (
"github.com/pkg/errors"
)
var (
// including math and currency symbols: $¤<~♡+=^ etc
symbolsRegexp = regexp.MustCompile(`\p{S}`)
latin1SupplementRegexp = regexp.MustCompile(`[\x{0080}-\x{00FF}]`) // Unicode Block “Latin-1 Supplement”
latinExtendedARegexp = regexp.MustCompile(`[\x{0100}-\x{017F}]`)
)
func (s *service) anilist(ctx context.Context, list *domain.List) error {
l := s.log.With().Str("type", "anilist").Str("list", list.Name).Logger()
@ -70,11 +62,7 @@ func (s *service) anilist(ctx context.Context, list *domain.List) error {
}
for title := range titlesToProcess {
// replace unicode symbols, Unicode Block “Latin-1 Supplement” and Unicode Block “Latin Extended-A” chars by "?"
clearedTitle := symbolsRegexp.ReplaceAllString(title, "?")
clearedTitle = latin1SupplementRegexp.ReplaceAllString(clearedTitle, "?")
clearedTitle = latinExtendedARegexp.ReplaceAllString(clearedTitle, "?")
for _, processedTitle := range processTitle(clearedTitle, list.MatchRelease) {
for _, processedTitle := range processTitle(title, list.MatchRelease) {
titleSet[processedTitle] = struct{}{}
}
}

View file

@ -9,13 +9,21 @@ import (
"strings"
)
// Regex patterns
// https://www.regular-expressions.info/unicode.html#category
// https://www.ncbi.nlm.nih.gov/staff/beck/charents/hex.html
var (
replaceRegexp = regexp.MustCompile(`[\p{P}\p{Z}\x{00C0}-\x{017E}\x{00AE}]`)
/*
replaceRegexp replaces various character classes/categories such as
\p{P} all Unicode punctuation category characters
\p{S} all Unicode symbol category characters
\p{Z} the Unicode separator category characters
\x{0080}-\x{017F} Unicode block "Latin-1 Supplement" and "Latin Extended-A" characters
https://www.unicode.org/reports/tr44/#General_Category_Values
https://www.regular-expressions.info/unicode.html#category
https://www.compart.com/en/unicode/block/U+0080
https://www.compart.com/en/unicode/block/U+0100
*/
replaceRegexp = regexp.MustCompile(`[\p{P}\p{S}\p{Z}\x{0080}-\x{017F}]`)
questionmarkRegexp = regexp.MustCompile(`[?]{2,}`)
regionCodeRegexp = regexp.MustCompile(`\(\S+\)`)
regionCodeRegexp = regexp.MustCompile(`\(\S+\)`) // also cleans titles from years like (YYYY)!
parenthesesEndRegexp = regexp.MustCompile(`\)$`)
)

View file

@ -267,7 +267,7 @@ func Test_processTitle(t *testing.T) {
title: "solo leveling 2ª temporada -ergam-se das sombras-",
matchRelease: false,
},
want: []string{"solo?leveling?2ª?temporada*ergam?se?das?sombras", "solo?leveling?2ª?temporada*ergam?se?das?sombras?"},
want: []string{"solo?leveling?2*temporada*ergam?se?das?sombras", "solo?leveling?2*temporada*ergam?se?das?sombras?"},
},
{
name: "test_32",