Feature: Improve filtering and release parsing (#56)

* feat: match against orig and cleaned rel

* feat: add more release parse tests

* feat: filter check tags

* feat: improve filter tag parsing

* refactor: simplify tag split and trim

* fix(indexers): use releasetags for milkie

* fix: properly replace spaces in string

* feat: better source check

* feat: extract releasetags
This commit is contained in:
Ludvig Lundgren 2022-01-01 21:50:38 +01:00 committed by GitHub
parent 8f53becbb3
commit ae1f14d0a4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 1036 additions and 130 deletions

View file

@ -40,6 +40,7 @@ type Release struct {
TorrentName string `json:"torrent_name"` // full release name
Size uint64 `json:"size"`
Raw string `json:"raw"` // Raw release
Clean string `json:"clean"` // cleaned release name
Title string `json:"title"` // Parsed title
Category string `json:"category"`
Season int `json:"season"`
@ -71,6 +72,7 @@ type Release struct {
IsScene bool `json:"is_scene"`
Origin string `json:"origin"` // P2P, Internal
Tags []string `json:"tags"`
ReleaseTags string `json:"-"`
Freeleech bool `json:"freeleech"`
FreeleechPercent int `json:"freeleech_percent"`
Uploader string `json:"uploader"`
@ -118,6 +120,9 @@ func (r *Release) Parse() error {
err = r.extractProper()
err = r.extractRepack()
err = r.extractWebsite()
err = r.extractReleaseTags()
r.Clean = cleanReleaseName(r.TorrentName)
if err != nil {
log.Trace().Msgf("could not parse release: %v", r.TorrentName)
@ -177,6 +182,19 @@ func (r *Release) extractSource() error {
return nil
}
func (r *Release) extractSourceFromTags(tag string) error {
if r.Source != "" {
return nil
}
v, err := findLast(tag, `(?i)\b(((?:PPV\.)?[HP]DTV|(?:HD)?CAM|B[DR]Rip|(?:HD-?)?TS|(?:PPV )?WEB-?DL(?: DVDRip)?|HDRip|DVDRip|DVDRIP|CamRip|WEB|W[EB]BRip|Blu-?Ray|DvDScr|telesync|CD|DVD|Vinyl|DAT|Cassette))\b`)
if err != nil {
return err
}
r.Source = v
return nil
}
func (r *Release) extractCodec() error {
v, err := findLast(r.TorrentName, `(?i)\b(HEVC|[hx]\.?26[45]|xvid|divx|AVC|MPEG-?2|AV1|VC-?1|VP9|WebP)\b`)
if err != nil {
@ -197,6 +215,20 @@ func (r *Release) extractContainer() error {
return nil
}
func (r *Release) extractContainerFromTags(tag string) error {
if r.Container != "" {
return nil
}
v, err := findLast(tag, `(?i)\b(AVI|MPG|MKV|MP4|VOB|m2ts|ISO|IMG)\b`)
if err != nil {
return err
}
r.Container = v
return nil
}
func (r *Release) extractHDR() error {
v, err := findLast(r.TorrentName, `(?i)(HDR10\+|HDR10|DoVi HDR|DV HDR|HDR|DV|DoVi|Dolby Vision \+ HDR10|Dolby Vision)`)
if err != nil {
@ -217,22 +249,37 @@ func (r *Release) extractAudio() error {
return nil
}
func (r *Release) extractGroup() error {
// try first for wierd anime group names [group] show name, or in brackets at the end
group := ""
func (r *Release) extractAudioFromTags(tag string) error {
if r.Audio != "" {
return nil
}
g, err := findLast(r.TorrentName, `\[(.*?)\]`)
v, err := findLast(tag, `(?i)(MP3|Ogg Vorbis|FLAC[\. ][1-7][\. ][0-2]|FLAC|Opus|DD-EX|DDP[\. ]?[124567][\. ][012] Atmos|DDP[\. ]?[124567][\. ][012]|DDP|DD[1-7][\. ][0-2]|Dual[\- ]Audio|LiNE|PCM|Dolby TrueHD [0-9][\. ][0-4]|TrueHD [0-9][\. ][0-4] Atmos|TrueHD [0-9][\. ][0-4]|DTS X|DTS-HD MA [0-9][\. ][0-4]|DTS-HD MA|DTS-ES|DTS [1-7][\. ][0-2]|DTS|DD|DD[12][\. ]0|Dolby Atmos|TrueHD ATMOS|TrueHD|Atmos|Dolby Digital Plus|Dolby Digital Audio|Dolby Digital|AAC[.-]LC|AAC (?:\.?[1-7]\.[0-2])?|AAC|eac3|AC3(?:\.5\.1)?)`)
if err != nil {
return err
}
group = g
r.Audio = v
if group == "" {
g2, err := findLast(r.TorrentName, `(- ?([^-]+(?:-={[^-]+-?$)?))$`)
if err != nil {
return err
}
group = g2
return nil
}
//func (r *Release) extractCueFromTags(tag string) error {
// v, err := findLast(tag, `Cue`)
// if err != nil {
// return err
// }
// r.HasCue = v
//
// return nil
//}
func (r *Release) extractGroup() error {
// try first for wierd anime group names [group] show name, or in brackets at the end
//g, err := findLast(r.Clean, `\[(.*?)\]`)
group, err := findLast(r.TorrentName, `\-([a-zA-Z0-9_\.]+)$`)
if err != nil {
return err
}
r.Group = group
@ -321,6 +368,116 @@ func (r *Release) extractWebsite() error {
return nil
}
func (r *Release) extractFreeleechFromTags(tag string) error {
if r.Freeleech == true {
return nil
}
// Start with the basic most common ones
v, err := findLast(tag, `Freeleech!`)
if err != nil {
return err
}
if v != "" {
r.Freeleech = true
return nil
}
r.Freeleech = false
return nil
}
func (r *Release) extractLogScoreFromTags(tag string) error {
if r.LogScore > 0 {
return nil
}
// Start with the basic most common ones
rxp, err := regexp.Compile(`([\d\.]+)%`)
if err != nil {
return err
//return errors.Wrapf(err, "invalid regex: %s", value)
}
matches := rxp.FindStringSubmatch(tag)
if matches != nil {
// first value is the match, second value is the text
if len(matches) >= 1 {
last := matches[len(matches)-1]
score, err := strconv.ParseInt(last, 10, 32)
if err != nil {
return err
}
r.LogScore = int(score)
return nil
}
}
return nil
}
func (r *Release) extractBitrateFromTags(tag string) error {
if r.Bitrate != "" {
return nil
}
// Start with the basic most common ones
rxp, err := regexp.Compile(`^(?:vbr|aps|apx|v\d|\d{2,4}|\d+\.\d+|q\d+\.[\dx]+|Other)?(?:\s*kbps|\s*kbits?|\s*k)?(?:\s*\(?(?:vbr|cbr)\)?)?$`)
if err != nil {
return err
//return errors.Wrapf(err, "invalid regex: %s", value)
}
matches := rxp.FindStringSubmatch(tag)
if matches != nil {
// first value is the match, second value is the text
if len(matches) >= 1 {
last := matches[len(matches)-1]
r.Bitrate = last
return nil
}
}
return nil
}
func (r *Release) extractReleaseTags() error {
if r.ReleaseTags == "" {
return nil
}
tags := SplitAny(r.ReleaseTags, ",|/ ")
for _, t := range tags {
var err error
err = r.extractAudioFromTags(t)
err = r.extractContainerFromTags(t)
err = r.extractSourceFromTags(t)
err = r.extractFreeleechFromTags(t)
err = r.extractLogScoreFromTags(t)
err = r.extractBitrateFromTags(t)
if err != nil {
continue
}
switch t {
case "Cue":
r.HasCue = true
case "Log":
r.HasLog = true
// check percent
}
}
return nil
}
func (r *Release) addRejection(reason string) {
r.Rejections = append(r.Rejections, reason)
}
@ -354,8 +511,8 @@ func (r *Release) CheckFilter(filter Filter) bool {
return false
}
// check against title when parsed correctly
if filter.Shows != "" && !checkFilterStrings(r.TorrentName, filter.Shows) {
// check against TorrentName and Clean which is a cleaned name without (. _ -)
if filter.Shows != "" && !checkMultipleFilterStrings(filter.Shows, r.TorrentName, r.Clean) {
r.addRejection("shows not matching")
return false
}
@ -372,22 +529,22 @@ func (r *Release) CheckFilter(filter Filter) bool {
// matchRelease
// TODO allow to match against regex
if filter.MatchReleases != "" && !checkFilterStrings(r.TorrentName, filter.MatchReleases) {
if filter.MatchReleases != "" && !checkMultipleFilterStrings(filter.MatchReleases, r.TorrentName, r.Clean) {
r.addRejection("match release not matching")
return false
}
if filter.ExceptReleases != "" && checkFilterStrings(r.TorrentName, filter.ExceptReleases) {
if filter.ExceptReleases != "" && !checkMultipleFilterStrings(filter.ExceptReleases, r.TorrentName, r.Clean) {
r.addRejection("except_releases: unwanted release")
return false
}
if filter.MatchReleaseGroups != "" && !checkFilterStrings(r.Group, filter.MatchReleaseGroups) {
if filter.MatchReleaseGroups != "" && !checkMultipleFilterGroups(filter.MatchReleaseGroups, r.Group, r.Clean) {
r.addRejection("release groups not matching")
return false
}
if filter.ExceptReleaseGroups != "" && checkFilterStrings(r.Group, filter.ExceptReleaseGroups) {
if filter.ExceptReleaseGroups != "" && checkMultipleFilterGroups(filter.ExceptReleaseGroups, r.Group, r.Clean) {
r.addRejection("unwanted release group")
return false
}
@ -412,7 +569,7 @@ func (r *Release) CheckFilter(filter Filter) bool {
return false
}
if len(filter.Sources) > 0 && !checkFilterSlice(r.Source, filter.Sources) {
if len(filter.Sources) > 0 && !checkFilterSource(r.Source, filter.Sources) {
r.addRejection("source not matching")
return false
}
@ -441,15 +598,15 @@ func (r *Release) CheckFilter(filter Filter) bool {
return false
}
//if filter.Tags != "" && !checkFilterStrings(r.Tags, filter.Tags) {
// r.addRejection("tags not matching")
// return false
//}
//
//if filter.ExceptTags != "" && checkFilterStrings(r.Tags, filter.ExceptTags) {
// r.addRejection("unwanted tags")
// return false
//}
if filter.Tags != "" && !checkFilterTags(r.Tags, filter.Tags) {
r.addRejection("tags not matching")
return false
}
if filter.ExceptTags != "" && checkFilterTags(r.Tags, filter.ExceptTags) {
r.addRejection("unwanted tags")
return false
}
return true
}
@ -504,21 +661,21 @@ func (r *Release) CheckSizeFilter(minSize string, maxSize string) bool {
// MapVars better name
func (r *Release) MapVars(varMap map[string]string) error {
if torrentName, err := getFirstStringMapValue(varMap, []string{"torrentName"}); err != nil {
if torrentName, err := getStringMapValue(varMap, "torrentName"); err != nil {
return errors.Wrap(err, "failed parsing required field")
} else {
r.TorrentName = html.UnescapeString(torrentName)
}
if category, err := getFirstStringMapValue(varMap, []string{"category"}); err == nil {
if category, err := getStringMapValue(varMap, "category"); err == nil {
r.Category = category
}
if freeleech, err := getFirstStringMapValue(varMap, []string{"freeleech"}); err == nil {
if freeleech, err := getStringMapValue(varMap, "freeleech"); err == nil {
r.Freeleech = strings.EqualFold(freeleech, "freeleech") || strings.EqualFold(freeleech, "yes")
}
if freeleechPercent, err := getFirstStringMapValue(varMap, []string{"freeleechPercent"}); err == nil {
if freeleechPercent, err := getStringMapValue(varMap, "freeleechPercent"); err == nil {
// remove % and trim spaces
freeleechPercent = strings.Replace(freeleechPercent, "%", "", -1)
freeleechPercent = strings.Trim(freeleechPercent, " ")
@ -531,11 +688,11 @@ func (r *Release) MapVars(varMap map[string]string) error {
r.FreeleechPercent = freeleechPercentInt
}
if uploader, err := getFirstStringMapValue(varMap, []string{"uploader"}); err == nil {
if uploader, err := getStringMapValue(varMap, "uploader"); err == nil {
r.Uploader = uploader
}
if torrentSize, err := getFirstStringMapValue(varMap, []string{"torrentSize"}); err == nil {
if torrentSize, err := getStringMapValue(varMap, "torrentSize"); err == nil {
size, err := humanize.ParseBytes(torrentSize)
if err != nil {
// log could not parse into bytes
@ -544,47 +701,27 @@ func (r *Release) MapVars(varMap map[string]string) error {
// TODO implement other size checks in filter
}
if scene, err := getFirstStringMapValue(varMap, []string{"scene"}); err == nil {
if scene, err := getStringMapValue(varMap, "scene"); err == nil {
r.IsScene = strings.EqualFold(scene, "true") || strings.EqualFold(scene, "yes")
}
//if year, err := getFirstStringMapValue(varMap, []string{"year"}); err == nil {
// yearI, err := strconv.Atoi(year)
// if err != nil {
// //log.Debug().Msgf("bad year var: %v", year)
// }
// r.Year = yearI
//}
// TODO split this into two
if tags, err := getFirstStringMapValue(varMap, []string{"releaseTags", "tags"}); err == nil {
r.Tags = []string{tags}
if yearVal, err := getStringMapValue(varMap, "year"); err == nil {
year, err := strconv.Atoi(yearVal)
if err != nil {
//log.Debug().Msgf("bad year var: %v", year)
}
r.Year = year
}
// TODO parse releaseType
//if releaseType, err := getFirstStringMapValue(varMap, []string{"releaseType", "$releaseType"}); err == nil {
// r.Type = releaseType
//}
if tags, err := getStringMapValue(varMap, "tags"); err == nil {
tagArr := strings.Split(strings.ReplaceAll(tags, " ", ""), ",")
r.Tags = tagArr
}
//if cue, err := getFirstStringMapValue(varMap, []string{"cue", "$cue"}); err == nil {
// r.Cue = strings.EqualFold(cue, "true")
//}
//if logVar, err := getFirstStringMapValue(varMap, []string{"log", "$log"}); err == nil {
// r.Log = logVar
//}
//if media, err := getFirstStringMapValue(varMap, []string{"media", "$media"}); err == nil {
// r.Media = media
//}
//if format, err := getFirstStringMapValue(varMap, []string{"format", "$format"}); err == nil {
// r.Format = format
//}
//if bitRate, err := getFirstStringMapValue(varMap, []string{"bitrate", "$bitrate"}); err == nil {
// r.Bitrate = bitRate
//}
// handle releaseTags. Most of them are redundant but some are useful
if releaseTags, err := getStringMapValue(varMap, "releaseTags"); err == nil {
r.ReleaseTags = releaseTags
}
return nil
}
@ -639,6 +776,35 @@ func checkFilterStrings(name string, filterList string) bool {
return false
}
// checkMultipleFilterStrings check against multiple vars of unknown length
func checkMultipleFilterStrings(filterList string, vars ...string) bool {
filterSplit := strings.Split(filterList, ",")
for _, name := range vars {
name = strings.ToLower(name)
for _, s := range filterSplit {
s = strings.ToLower(s)
s = strings.Trim(s, " ")
// check if line contains * or ?, if so try wildcard match, otherwise try substring match
a := strings.ContainsAny(s, "?|*")
if a {
match := wildcard.Match(s, name)
if match {
return true
}
} else {
b := strings.Contains(name, s)
if b {
return true
}
}
}
}
return false
}
// checkFilterIntStrings "1,2,3-20"
func checkFilterIntStrings(value int, filterList string) bool {
filters := strings.Split(filterList, ",")
@ -685,6 +851,81 @@ func checkFilterIntStrings(value int, filterList string) bool {
return false
}
func checkMultipleFilterGroups(filterList string, vars ...string) bool {
filterSplit := strings.Split(filterList, ",")
for _, name := range vars {
name = strings.ToLower(name)
for _, s := range filterSplit {
s = strings.ToLower(strings.Trim(s, " "))
// check if line contains * or ?, if so try wildcard match, otherwise try substring match
a := strings.ContainsAny(s, "?|*")
if a {
match := wildcard.Match(s, name)
if match {
return true
}
} else {
split := SplitAny(name, " .-")
for _, c := range split {
if c == s {
return true
}
}
continue
}
}
}
return false
}
func checkFilterSource(name string, filterList []string) bool {
// remove dash (-) in blu-ray web-dl and make lowercase
name = strings.ToLower(strings.ReplaceAll(name, "-", ""))
for _, filter := range filterList {
// remove dash (-) in blu-ray web-dl, trim spaces and make lowercase
filter = strings.ToLower(strings.Trim(strings.ReplaceAll(filter, "-", ""), " "))
b := strings.Contains(name, filter)
if b {
return true
}
}
return false
}
func checkFilterTags(tags []string, filter string) bool {
filterTags := strings.Split(filter, ",")
for _, tag := range tags {
tag = strings.ToLower(tag)
for _, filter := range filterTags {
filter = strings.ToLower(filter)
filter = strings.Trim(filter, " ")
// check if line contains * or ?, if so try wildcard match, otherwise try substring match
a := strings.ContainsAny(filter, "?|*")
if a {
match := wildcard.Match(filter, tag)
if match {
return true
}
} else {
b := strings.Contains(tag, filter)
if b {
return true
}
}
}
}
return false
}
func checkFreeleechPercent(announcePercent int, filterPercent string) bool {
filters := strings.Split(filterPercent, ",")
@ -874,6 +1115,44 @@ func findLastInt(input string, pattern string) (int, error) {
return 0, nil
}
func SplitAny(s string, seps string) []string {
splitter := func(r rune) bool {
return strings.ContainsRune(seps, r)
}
return strings.FieldsFunc(s, splitter)
}
//func Splitter(s string, splits string) []string {
// m := make(map[rune]int)
// for _, r := range splits {
// m[r] = 1
// }
//
// splitter := func(r rune) bool {
// return m[r] == 1
// }
//
// return strings.FieldsFunc(s, splitter)
//}
//
//func canonicalizeString(s string) []string {
// //a := strings.FieldsFunc(s, split)
// a := Splitter(s, " .")
//
// return a
//}
func cleanReleaseName(input string) string {
// Make a Regex to say we only want letters and numbers
reg, err := regexp.Compile(`[\x00-\x1F\x2D\x2E\x5F\x7F]`)
if err != nil {
return ""
}
processedString := reg.ReplaceAllString(input, " ")
return processedString
}
type ReleaseStats struct {
TotalCount int64 `json:"total_count"`
FilteredCount int64 `json:"filtered_count"`