feat(filters): skip duplicates (#1711)

* feat(filters): skip duplicates

* fix: add interface instead of any

* fix(filters): tonullint

* feat(filters): skip dupes check month day

* chore: cleanup

* feat(db): set autoincrement id

* feat(filters): add repack and proper to dupe profile

* feat(filters): add default dupe profiles

* feat(duplicates): check audio and website

* feat(duplicates): update tests

* feat(duplicates): add toggles on addform

* feat(duplicates): fix sqlite upgrade path and initialize duplicate profiles

* feat(duplicates): simplify sqlite upgrade

avoiding temp table and unwieldy select.  Besides, FK constraints
are turned off anyway in #229.

* feat(duplicates): change CheckIsDuplicateRelease treatment of PROPER and REPACK

"Proper" and "Repack" are not parallel to the other conditions like "Title",
so they do not belong as dedup conditions.  "PROPER" means there was an issue in
the previous release, and so a PROPER is never a duplicate, even if it replaces
another PROPER.  Similarly, "REPACK" means there was an issue in the previous
release by that group, and so it is a duplicate only if we previously took a
release from a DIFFERENT group.

I have not removed Proper and Repack from the UI or the schema yet.

* feat(duplicates): update postgres schema to match sqlite

* feat(duplicates): fix web build errors

* feat(duplicates): fix postgres errors

* feat(filters): do leftjoin for duplicate profile

* fix(filters): partial update dupe profile

* go fmt `internal/domain/filter.go`

* feat(duplicates): restore straightforward logic for proper/repack

* feat(duplicates): remove mostly duplicate TV duplicate profiles

Having one profile seems the cleanest.  If somebody wants multiple
resolutions then they can add Resolution to the duplicate profile.
Tested this profile with both weekly episodic releases and daily
show releases.

* feat(release): add db indexes and sub_title

* feat(release): add IsDuplicate tests

* feat(release): update action handler

* feat(release): add more tests for skip duplicates

* feat(duplicates): check audio

* feat(duplicates): add more tests

* feat(duplicates): match edition cut and more

* fix(duplicates): tests

* fix(duplicates): missing imports

* fix(duplicates): tests

* feat(duplicates): handle sub_title edition and language in ui

* fix(duplicates): tests

* feat(duplicates): check name against normalized hash

* fix(duplicates): tests

* chore: update .gitignore to ignore .pnpm-store

* fix: tests

* fix(filters): tests

* fix: bad conflict merge

* fix: update release type in test

* fix: use vendored hot-toast

* fix: release_test.go

* fix: rss_test.go

* feat(duplicates): improve title hashing for unique check

* feat(duplicates): further improve title hashing for unique check with lang

* feat(duplicates): fix tests

* feat(duplicates): add macros IsDuplicate and DuplicateProfile ID and name

* feat(duplicates): add normalized hash match option

* fix: headlessui-state prop warning

* fix(duplicates): add missing year in daily ep normalize

* fix(duplicates): check rejections len

---------

Co-authored-by: ze0s <ze0s@riseup.net>
This commit is contained in:
kenstir 2024-12-25 16:33:46 -05:00 committed by GitHub
parent d153ac44b8
commit 4009554d10
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
49 changed files with 3792 additions and 743 deletions

View file

@ -7,7 +7,11 @@ import (
"bufio"
"bytes"
"context"
"crypto/md5"
"encoding/hex"
"fmt"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
"html"
"io"
"math"
@ -18,6 +22,7 @@ import (
"strconv"
"strings"
"time"
"unicode"
"github.com/autobrr/autobrr/pkg/errors"
"github.com/autobrr/autobrr/pkg/sharedhttp"
@ -43,6 +48,11 @@ type ReleaseRepo interface {
GetActionStatus(ctx context.Context, req *GetReleaseActionStatusRequest) (*ReleaseActionStatus, error)
StoreReleaseActionStatus(ctx context.Context, status *ReleaseActionStatus) error
StoreDuplicateProfile(ctx context.Context, profile *DuplicateReleaseProfile) error
FindDuplicateReleaseProfiles(ctx context.Context) ([]*DuplicateReleaseProfile, error)
DeleteReleaseProfileDuplicate(ctx context.Context, id int64) error
CheckIsDuplicateRelease(ctx context.Context, profile *DuplicateReleaseProfile, release *Release) (bool, error)
}
type Release struct {
@ -55,6 +65,7 @@ type Release struct {
Implementation ReleaseImplementation `json:"implementation"` // irc, rss, api
Timestamp time.Time `json:"timestamp"`
AnnounceType AnnounceType `json:"announce_type"`
Type rls.Type `json:"type"` // rls.Type
InfoURL string `json:"info_url"`
DownloadURL string `json:"download_url"`
MagnetURI string `json:"-"`
@ -63,9 +74,11 @@ type Release struct {
TorrentTmpFile string `json:"-"`
TorrentDataRawBytes []byte `json:"-"`
TorrentHash string `json:"-"`
TorrentName string `json:"name"` // full release name
TorrentName string `json:"name"` // full release name
NormalizedHash string `json:"normalized_hash"` // normalized torrent name and md5 hashed
Size uint64 `json:"size"`
Title string `json:"title"` // Parsed title
Title string `json:"title"` // Parsed title
SubTitle string `json:"sub_title"` // Parsed secondary title for shows e.g. episode name
Description string `json:"-"`
Category string `json:"category"`
Categories []string `json:"categories,omitempty"`
@ -89,8 +102,11 @@ type Release struct {
Proper bool `json:"proper"`
Repack bool `json:"repack"`
Website string `json:"website"`
Hybrid bool `json:"hybrid"`
Edition []string `json:"edition"`
Cut []string `json:"cut"`
MediaProcessing string `json:"media_processing"` // Remux, Encode, Untouched
Artists string `json:"-"`
Type string `json:"type"` // Album,Single,EP
LogScore int `json:"-"`
HasCue bool `json:"-"`
HasLog bool `json:"-"`
@ -110,15 +126,183 @@ type Release struct {
AdditionalSizeCheckRequired bool `json:"-"`
AdditionalUploaderCheckRequired bool `json:"-"`
AdditionalRecordLabelCheckRequired bool `json:"-"`
IsDuplicate bool `json:"-"`
SkipDuplicateProfileID int64 `json:"-"`
SkipDuplicateProfileName string `json:"-"`
FilterID int `json:"-"`
Filter *Filter `json:"-"`
ActionStatus []ReleaseActionStatus `json:"action_status"`
}
// Hash returns the hex-encoded MD5 digest of the normalized release name.
//
// For video releases (see IsTypeVideo) the digest input is rebuilt from the
// parsed fields via NormalizedTitle, so differently formatted names of the same
// release have the best chance of matching; every other type hashes TorrentName.
// MD5 serves only as a compact fingerprint here, not as a security measure.
//
// Hash panics if MustNormalize fails to transform the name.
func (r *Release) Hash() string {
	name := r.TorrentName
	if r.IsTypeVideo() {
		name = r.NormalizedTitle()
	}

	sum := md5.Sum([]byte(MustNormalize(name)))

	return hex.EncodeToString(sum[:])
}
// MustNormalize runs s through the transformer returned by NewNormalizer and
// returns the result: a lower cased, clean form of s useful for matching titles.
//
// Following the Must convention, it panics if the transform reports an error.
func MustNormalize(s string) string {
	normalized, _, err := transform.String(NewNormalizer(), s)
	if err != nil {
		panic(err)
	}

	return normalized
}
// NewNormalizer returns a custom variant of the rls normalizer that keeps the
// plus sign, so tags such as HDR10+ survive normalization.
//
// It creates a new text transformer chain (similar to NewCleaner) that
// normalizes text to a lower case, clean form useful for matching titles:
// NFD decomposition, then an rls collapser, then NFC recomposition.
//
// See: https://go.dev/blog/normalization
func NewNormalizer() transform.Transformer {
	// Characters handed to the collapser as its "remove" and "space" sets.
	// '+' is intentionally absent from the remove set.
	const (
		removeChars = "`" + `':;~!@#%^*=()[]{}<>/?|\",`
		spaceChars  = " \t\r\n\f._"
	)

	// mapRune rewrites individual runes based on their neighbours. Returning -1
	// presumably tells the collapser to drop the rune (confirm against rls).
	mapRune := func(r, prev, next rune) rune {
		besideLetter := func() bool {
			return unicode.IsLetter(prev) || unicode.IsLetter(next)
		}

		switch r {
		case '-':
			// a dash directly after whitespace is discarded
			if unicode.IsSpace(prev) {
				return -1
			}
		case '$':
			// "$" used as a stylized letter becomes S, otherwise it is discarded
			if besideLetter() {
				return 'S'
			}
			return -1
		case '£':
			// "£" used as a stylized letter becomes L, otherwise it is discarded
			if besideLetter() {
				return 'L'
			}
			return -1
		}

		return r
	}

	// NOTE(review): the two boolean arguments are presumably the collapser's
	// lower-case and trim switches - confirm against rls.NewCollapser.
	collapser := rls.NewCollapser(true, true, removeChars, spaceChars, mapRune)

	return transform.Chain(norm.NFD, collapser, norm.NFC)
}
// NormalizedTitle rebuilds a canonical release name from the parsed fields.
//
// The elements are joined with single spaces in a fixed order: title, date
// (year month day, or just year), language, cut, edition, season/episode,
// PROPER, repack tag, HYBRiD, sub title, resolution, website, region, source,
// REMUX, codec, HDR and audio. When a group is set it is appended as "-GROUP".
//
// Hash uses this string for video releases, so the order and formatting of the
// elements must stay stable.
func (r *Release) NormalizedTitle() string {
	parts := []string{r.Title}

	// appendJoined adds a multi-valued field as one space-joined element.
	appendJoined := func(values []string) {
		if len(values) > 0 {
			parts = append(parts, strings.Join(values, " "))
		}
	}

	// appendIfSet adds a single-valued field unless it is empty.
	appendIfSet := func(value string) {
		if value != "" {
			parts = append(parts, value)
		}
	}

	// daily shows carry a full date, everything else at most a year
	switch {
	case r.Year > 0 && r.Month > 0 && r.Day > 0:
		parts = append(parts, fmt.Sprintf("%d %d %d", r.Year, r.Month, r.Day))
	case r.Year > 0:
		parts = append(parts, fmt.Sprintf("%d", r.Year))
	}

	appendJoined(r.Language)
	appendJoined(r.Cut)
	appendJoined(r.Edition)

	// single episode vs. season pack
	switch {
	case r.Season > 0 && r.Episode > 0:
		parts = append(parts, fmt.Sprintf("S%dE%d", r.Season, r.Episode))
	case r.Season > 0 && r.Episode == 0:
		parts = append(parts, fmt.Sprintf("S%d", r.Season))
	}

	if r.Proper {
		parts = append(parts, "PROPER")
	}

	if r.Repack {
		// RepackStr returns "" when Other holds no repack tag; the element is
		// appended regardless so the output stays unchanged.
		parts = append(parts, r.RepackStr())
	}

	if r.Hybrid {
		parts = append(parts, "HYBRiD")
	}

	appendIfSet(r.SubTitle)
	appendIfSet(r.Resolution)
	appendIfSet(r.Website)
	appendIfSet(r.Region)
	appendIfSet(r.Source)

	// remux
	if r.MediaProcessing == "REMUX" {
		parts = append(parts, "REMUX")
	}

	appendJoined(r.Codec)
	appendJoined(r.HDR)

	if len(r.Audio) > 0 {
		parts = append(parts, r.AudioString())
	}

	title := strings.Join(parts, " ")
	if r.Group != "" {
		title = fmt.Sprintf("%s-%s", title, r.Group)
	}

	return title
}
// RepackStr returns the repack tag found in r.Other, or "" when there is none.
//
// The tags are probed in priority order REPACK, REREPACK, REPACK2, REPACK3 and
// the first one present wins.
func (r *Release) RepackStr() string {
	for _, tag := range []string{"REPACK", "REREPACK", "REPACK2", "REPACK3"} {
		if slices.Contains(r.Other, tag) {
			return tag
		}
	}

	return ""
}
// Raw parses s with rls.ParseString and returns the parsed rls.Release.
// The receiver is neither read nor modified.
func (r *Release) Raw(s string) rls.Release {
return rls.ParseString(s)
}
// ParseType sets r.Type to the result of rls.ParseType(s).
func (r *Release) ParseType(s string) {
r.Type = rls.ParseType(s)
}
// IsTypeVideo returns the result of r.Type.Is for the rls.Movie, rls.Series
// and rls.Episode types, i.e. whether the release is movie or TV content.
func (r *Release) IsTypeVideo() bool {
return r.Type.Is(rls.Movie, rls.Series, rls.Episode)
}
type AnnounceType string
const (
@ -361,6 +545,10 @@ func NewRelease(indexer IndexerMinimal) *Release {
Implementation: ReleaseImplementationIRC,
Timestamp: time.Now(),
Tags: []string{},
Language: []string{},
Edition: []string{},
Cut: []string{},
Other: []string{},
Size: 0,
AnnounceType: AnnounceTypeNew,
}
@ -371,28 +559,42 @@ func NewRelease(indexer IndexerMinimal) *Release {
func (r *Release) ParseString(title string) {
rel := rls.ParseString(title)
r.Type = rel.Type.String()
r.Type = rel.Type
r.TorrentName = title
r.Source = rel.Source
r.Resolution = rel.Resolution
r.Region = rel.Region
if rel.Language != nil {
r.Language = rel.Language
}
r.Audio = rel.Audio
r.AudioChannels = rel.Channels
r.Codec = rel.Codec
r.Container = rel.Container
r.HDR = rel.HDR
r.Artists = rel.Artist
r.Language = rel.Language
r.Other = rel.Other
if rel.Other != nil {
r.Other = rel.Other
}
r.Proper = slices.Contains(r.Other, "PROPER")
r.Repack = slices.Contains(r.Other, "REPACK")
r.Repack = slices.Contains(r.Other, "REPACK") || slices.Contains(r.Other, "REREPACK")
r.Hybrid = slices.Contains(r.Other, "HYBRiD")
// TODO default to Encode and set Untouched for discs
if slices.Contains(r.Other, "REMUX") {
r.MediaProcessing = "REMUX"
}
if r.Title == "" {
r.Title = rel.Title
}
r.SubTitle = rel.Subtitle
if r.Season == 0 {
r.Season = rel.Series
@ -415,8 +617,22 @@ func (r *Release) ParseString(title string) {
r.Group = rel.Group
}
if r.Website == "" {
r.Website = rel.Collection
}
if rel.Cut != nil {
r.Cut = rel.Cut
}
if rel.Edition != nil {
r.Edition = rel.Edition
}
r.ParseReleaseTagsString(r.ReleaseTags)
r.extraParseSource(rel)
r.NormalizedHash = r.Hash()
}
func (r *Release) extraParseSource(rel rls.Release) {
@ -451,7 +667,7 @@ func (r *Release) extraParseSource(rel rls.Release) {
}
// if the source is not already WEB-DL, BluRay or UHD.BluRay, the resolution is 1080p or 2160p, the codec is AVC, H.264, H.265 or HEVC, and other contains REMUX, then override the source
if !basicContainsSlice(r.Source, []string{"WEB-DL", "BluRay", "UHD.BluRay"}) && basicContainsSlice(r.Resolution, []string{"1080p", "2160p"}) && basicContainsMatch(r.Codec, []string{"AVC", "HEVC"}) && basicContainsMatch(r.Other, []string{"REMUX"}) {
if !basicContainsSlice(r.Source, []string{"WEB-DL", "BluRay", "UHD.BluRay"}) && basicContainsSlice(r.Resolution, []string{"1080p", "2160p"}) && basicContainsMatch(r.Codec, []string{"AVC", "H.264", "H.265", "HEVC"}) && basicContainsMatch(r.Other, []string{"REMUX"}) {
// handle missing or unexpected source for some bluray releases
if r.Resolution == "1080p" {
r.Source = "BluRay"
@ -463,6 +679,10 @@ func (r *Release) extraParseSource(rel rls.Release) {
}
func (r *Release) ParseReleaseTagsString(tags string) {
if tags == "" {
return
}
cleanTags := CleanReleaseTags(tags)
t := ParseReleaseTagString(cleanTags)
@ -543,6 +763,20 @@ func (r *Release) OpenTorrentFile() error {
return nil
}
// AudioString joins r.Audio followed by r.AudioChannels with single spaces and
// returns a string like "DDP Atmos 5.1".
//
// AudioChannels is always included, even when empty, so a release with audio
// formats but no channel layout yields a trailing space (e.g. "DDP ").
func (r *Release) AudioString() string {
	parts := make([]string, 0, len(r.Audio)+1)
	parts = append(parts, r.Audio...)
	parts = append(parts, r.AudioChannels)

	return strings.Join(parts, " ")
}
// DownloadTorrentFileCtx is the exported entry point for r.downloadTorrentFile:
// it passes ctx through unchanged and returns whatever error that call reports.
func (r *Release) DownloadTorrentFileCtx(ctx context.Context) error {
return r.downloadTorrentFile(ctx)
}
@ -992,3 +1226,30 @@ func getUniqueTags(target []string, source []string) []string {
return target
}
// DuplicateReleaseProfile is a named set of boolean toggles, one per release
// attribute, serialized to JSON with snake_case keys.
//
// NOTE(review): presumably a toggle set to true means that attribute takes part
// in the duplicate comparison done by ReleaseRepo.CheckIsDuplicateRelease -
// confirm against that implementation.
type DuplicateReleaseProfile struct {
// identity of the profile
ID int64 `json:"id"`
Name string `json:"name"`
// per-attribute toggles
Protocol bool `json:"protocol"`
ReleaseName bool `json:"release_name"`
Hash bool `json:"hash"`
Title bool `json:"title"`
SubTitle bool `json:"sub_title"`
Year bool `json:"year"`
Month bool `json:"month"`
Day bool `json:"day"`
Source bool `json:"source"`
Resolution bool `json:"resolution"`
Codec bool `json:"codec"`
Container bool `json:"container"`
DynamicRange bool `json:"dynamic_range"`
Audio bool `json:"audio"`
Group bool `json:"group"`
Season bool `json:"season"`
Episode bool `json:"episode"`
Website bool `json:"website"`
Proper bool `json:"proper"`
Repack bool `json:"repack"`
Edition bool `json:"edition"`
Language bool `json:"language"`
}