mirror of
https://github.com/idanoo/autobrr
synced 2025-07-22 16:29:12 +00:00
refactor(lists): title character filtering (#1955)
This commit is contained in:
parent
4fbaa0b72c
commit
6e77f0339b
3 changed files with 15 additions and 19 deletions
|
@ -7,7 +7,6 @@ import (
|
|||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
|
@ -16,13 +15,6 @@ import (
|
|||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
var (
|
||||
// including math and currency symbols: $¤<~♡+=^ etc
|
||||
symbolsRegexp = regexp.MustCompile(`\p{S}`)
|
||||
latin1SupplementRegexp = regexp.MustCompile(`[\x{0080}-\x{00FF}]`) // Unicode Block “Latin-1 Supplement”
|
||||
latinExtendedARegexp = regexp.MustCompile(`[\x{0100}-\x{017F}]`)
|
||||
)
|
||||
|
||||
func (s *service) anilist(ctx context.Context, list *domain.List) error {
|
||||
l := s.log.With().Str("type", "anilist").Str("list", list.Name).Logger()
|
||||
|
||||
|
@ -70,11 +62,7 @@ func (s *service) anilist(ctx context.Context, list *domain.List) error {
|
|||
}
|
||||
|
||||
for title := range titlesToProcess {
|
||||
// replace unicode symbols, Unicode Block “Latin-1 Supplement” and Unicode Block “Latin Extended-A” chars by "?"
|
||||
clearedTitle := symbolsRegexp.ReplaceAllString(title, "?")
|
||||
clearedTitle = latin1SupplementRegexp.ReplaceAllString(clearedTitle, "?")
|
||||
clearedTitle = latinExtendedARegexp.ReplaceAllString(clearedTitle, "?")
|
||||
for _, processedTitle := range processTitle(clearedTitle, list.MatchRelease) {
|
||||
for _, processedTitle := range processTitle(title, list.MatchRelease) {
|
||||
titleSet[processedTitle] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,13 +9,21 @@ import (
|
|||
"strings"
|
||||
)
|
||||
|
||||
// Regex patterns
|
||||
// https://www.regular-expressions.info/unicode.html#category
|
||||
// https://www.ncbi.nlm.nih.gov/staff/beck/charents/hex.html
|
||||
var (
|
||||
replaceRegexp = regexp.MustCompile(`[\p{P}\p{Z}\x{00C0}-\x{017E}\x{00AE}]`)
|
||||
/*
|
||||
replaceRegexp replaces various character classes/categories such as
|
||||
\p{P} all Unicode punctuation category characters
|
||||
\p{S} all Unicode symbol category characters
|
||||
\p{Z} the Unicode separator category characters
|
||||
\x{0080}-\x{017F} Unicode block "Latin-1 Supplement" and "Latin Extended-A" characters
|
||||
https://www.unicode.org/reports/tr44/#General_Category_Values
|
||||
https://www.regular-expressions.info/unicode.html#category
|
||||
https://www.compart.com/en/unicode/block/U+0080
|
||||
https://www.compart.com/en/unicode/block/U+0100
|
||||
*/
|
||||
replaceRegexp = regexp.MustCompile(`[\p{P}\p{S}\p{Z}\x{0080}-\x{017F}]`)
|
||||
questionmarkRegexp = regexp.MustCompile(`[?]{2,}`)
|
||||
regionCodeRegexp = regexp.MustCompile(`\(\S+\)`)
|
||||
regionCodeRegexp = regexp.MustCompile(`\(\S+\)`) // also cleans titles from years like (YYYY)!
|
||||
parenthesesEndRegexp = regexp.MustCompile(`\)$`)
|
||||
)
|
||||
|
||||
|
|
|
@ -267,7 +267,7 @@ func Test_processTitle(t *testing.T) {
|
|||
title: "solo leveling 2ª temporada -ergam-se das sombras-",
|
||||
matchRelease: false,
|
||||
},
|
||||
want: []string{"solo?leveling?2ª?temporada*ergam?se?das?sombras", "solo?leveling?2ª?temporada*ergam?se?das?sombras?"},
|
||||
want: []string{"solo?leveling?2*temporada*ergam?se?das?sombras", "solo?leveling?2*temporada*ergam?se?das?sombras?"},
|
||||
},
|
||||
{
|
||||
name: "test_32",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue