feat(feeds): improve RSS (#502)

* feat(feeds): improve rss * save last_run time * remove interval check * refactor feed job keys * add rss test * add max_age check * feat(feeds): rss basic freeleech parsing * feat(feeds): rss cookie support * feat(feeds): db get max_age * feat(feeds): update log messages * feat(feeds): pass cookie to release for download * feat(feeds): improve size parsing * feat(feeds): improve datetime check
2025-07-23 00:39:13 +00:00 · 2022-10-18 18:51:10 +02:00 · 2022-10-18 18:51:10 +02:00 · e2bb14afa4
commit e2bb14afa4
parent ac988f28f4
15 changed files with 741 additions and 209 deletions
--- a/internal/feed/rss.go
+++ b/internal/feed/rss.go
@ -2,8 +2,10 @@ package feed

 import (
 	"context"
+	"encoding/xml"
+	"fmt"
 	"net/url"
-	"sort"
+	"regexp"
 	"time"

 	"github.com/autobrr/autobrr/internal/domain"
@ -15,11 +17,13 @@ import (
 )

 type RSSJob struct {
+	Feed              *domain.Feed
 	Name              string
 	IndexerIdentifier string
 	Log               zerolog.Logger
 	URL               string
-	Repo              domain.FeedCacheRepo
+	Repo              domain.FeedRepo
+	CacheRepo         domain.FeedCacheRepo
 	ReleaseSvc        release.Service
 	Timeout           time.Duration

@ -29,13 +33,15 @@ type RSSJob struct {
 	JobID int
 }

-func NewRSSJob(name string, indexerIdentifier string, log zerolog.Logger, url string, repo domain.FeedCacheRepo, releaseSvc release.Service, timeout time.Duration) *RSSJob {
+func NewRSSJob(feed *domain.Feed, name string, indexerIdentifier string, log zerolog.Logger, url string, repo domain.FeedRepo, cacheRepo domain.FeedCacheRepo, releaseSvc release.Service, timeout time.Duration) *RSSJob {
 	return &RSSJob{
+		Feed:              feed,
 		Name:              name,
 		IndexerIdentifier: indexerIdentifier,
 		Log:               log,
 		URL:               url,
 		Repo:              repo,
+		CacheRepo:         cacheRepo,
 		ReleaseSvc:        releaseSvc,
 		Timeout:           timeout,
 	}
@ -43,7 +49,7 @@ func NewRSSJob(name string, indexerIdentifier string, log zerolog.Logger, url st

 func (j *RSSJob) Run() {
 	if err := j.process(); err != nil {
-		j.Log.Err(err).Int("attempts", j.attempts).Msg("rss feed process error")
+		j.Log.Error().Err(err).Int("attempts", j.attempts).Msg("rss feed process error")

 		j.errors = append(j.errors, err)
 		return
@ -71,9 +77,13 @@ func (j *RSSJob) process() error {
 	releases := make([]*domain.Release, 0)

 	for _, item := range items {
-		rls := j.processItem(item)
+		item := item
+		j.Log.Debug().Msgf("item: %v", item.Title)

-		releases = append(releases, rls)
+		rls := j.processItem(item)
+		if rls != nil {
+			releases = append(releases, rls)
+		}
 	}

 	// process all new releases
@ -83,6 +93,16 @@ func (j *RSSJob) process() error {
 }

 func (j *RSSJob) processItem(item *gofeed.Item) *domain.Release {
+	now := time.Now()
+
+	if j.Feed.MaxAge > 0 {
+		if item.PublishedParsed != nil {
+			if !isNewerThanMaxAge(j.Feed.MaxAge, *item.PublishedParsed, now) {
+				return nil
+			}
+		}
+	}
+
 	rls := domain.NewRelease(j.IndexerIdentifier)
 	rls.Implementation = domain.ReleaseImplementationRSS

@ -117,6 +137,8 @@ func (j *RSSJob) processItem(item *gofeed.Item) *domain.Release {
 	}

 	for _, v := range item.Categories {
+		rls.Categories = append(rls.Categories, item.Categories...)
+
 		if len(rls.Category) != 0 {
 			rls.Category += ", "
 		}
@ -138,6 +160,38 @@ func (j *RSSJob) processItem(item *gofeed.Item) *domain.Release {
 			rls.ParseSizeBytesString(sz)
 		}
 	}
+
+	// additional size parsing
+	// some feeds have a fixed size for enclosure so let's check for custom elements
+	// and parse size from there if it differs
+	if customTorrent, ok := item.Custom["torrent"]; ok {
+		var element itemCustomElement
+		if err := xml.Unmarshal([]byte("<torrent>"+customTorrent+"</torrent>"), &element); err != nil {
+			j.Log.Error().Err(err).Msg("could not unmarshal item.Custom.Torrent")
+		}
+
+		if element.ContentLength > 0 {
+			if uint64(element.ContentLength) != rls.Size {
+				rls.Size = uint64(element.ContentLength)
+			}
+		}
+
+		if rls.TorrentHash == "" && element.InfoHash != "" {
+			rls.TorrentHash = element.InfoHash
+		}
+	}
+
+	// basic freeleech parsing
+	if isFreeleech([]string{item.Title, item.Description}) {
+		rls.Freeleech = true
+		rls.Bonus = []string{"Freeleech"}
+	}
+
+	// add cookie to release for download if needed
+	if j.Feed.Cookie != "" {
+		rls.RawCookie = j.Feed.Cookie
+	}
+
 	return rls
 }

@ -145,51 +199,103 @@ func (j *RSSJob) getFeed() (items []*gofeed.Item, err error) {
 	ctx, cancel := context.WithTimeout(context.Background(), j.Timeout)
 	defer cancel()

-	feed, err := gofeed.NewParser().ParseURLWithContext(j.URL, ctx) // there's an RSS specific parser as well.
+	feed, err := NewFeedParser(j.Timeout, j.Feed.Cookie).ParseURLWithContext(ctx, j.URL)
 	if err != nil {
-		j.Log.Error().Err(err).Msgf("error fetching rss feed items")
 		return nil, errors.Wrap(err, "error fetching rss feed items")
 	}

+	// get feed as JSON string
+	feedData := feed.String()
+
+	if err := j.Repo.UpdateLastRunWithData(context.Background(), j.Feed.ID, feedData); err != nil {
+		j.Log.Error().Err(err).Msgf("error updating last run for feed id: %v", j.Feed.ID)
+	}
+
 	j.Log.Debug().Msgf("refreshing rss feed: %v, found (%d) items", j.Name, len(feed.Items))

 	if len(feed.Items) == 0 {
 		return
 	}

-	sort.Sort(feed)
+	bucketKey := fmt.Sprintf("%v+%v", j.IndexerIdentifier, j.Name)
+
+	//sort.Sort(feed)
+
+	bucketCount, err := j.CacheRepo.GetCountByBucket(ctx, bucketKey)
+	if err != nil {
+		j.Log.Error().Err(err).Msg("could not check if item exists")
+		return nil, err
+	}
+
+	// set ttl to 1 month
+	ttl := time.Now().AddDate(0, 1, 0)

 	for _, i := range feed.Items {
-		s := i.GUID
-		if len(s) == 0 {
-			s = i.Title
-			if len(s) == 0 {
+		item := i
+
+		key := item.GUID
+		if len(key) == 0 {
+			key = item.Title
+			if len(key) == 0 {
 				continue
 			}
 		}

-		exists, err := j.Repo.Exists(j.Name, s)
+		exists, err := j.CacheRepo.Exists(bucketKey, key)
 		if err != nil {
 			j.Log.Error().Err(err).Msg("could not check if item exists")
 			continue
 		}
 		if exists {
-			j.Log.Trace().Msgf("cache item exists, skipping release: %v", i.Title)
+			j.Log.Trace().Msgf("cache item exists, skipping release: %v", item.Title)
 			continue
 		}

-		// set ttl to 1 month
-		ttl := time.Now().AddDate(0, 1, 0)
-
-		if err := j.Repo.Put(j.Name, s, []byte(i.Title), ttl); err != nil {
-			j.Log.Error().Stack().Err(err).Str("entry", s).Msg("cache.Put: error storing item in cache")
+		if err := j.CacheRepo.Put(bucketKey, key, []byte(item.Title), ttl); err != nil {
+			j.Log.Error().Err(err).Str("entry", key).Msg("cache.Put: error storing item in cache")
 			continue
 		}

-		// only append if we successfully added to cache
-		items = append(items, i)
+		// first time we fetch the feed the cached bucket count will be 0
+		// only append to items if it's bigger than 0, so we get new items only
+		if bucketCount > 0 {
+			items = append(items, item)
+		}
 	}

 	// send to filters
 	return
 }
+
+// isNewerThanMaxAge reports whether item lies strictly after the cutoff
+// obtained by subtracting maxAge seconds from now.
+//
+// An item timestamped exactly at the cutoff is not considered newer.
+func isNewerThanMaxAge(maxAge int, item, now time.Time) bool {
+	// oldest timestamp that is still rejected: now minus max age
+	cutoff := now.Add(time.Duration(-maxAge) * time.Second)
+
+	return item.After(cutoff)
+}
+
+// freeleechRegex matches the standalone word "freeleech", ignoring case.
+// Compiled once at package scope rather than on every loop iteration.
+var freeleechRegex = regexp.MustCompile(`(?mi)(\bfreeleech\b)`)
+
+// isFreeleech reports whether any of the given strings contains the
+// standalone word "freeleech" (basic freeleech parsing).
+func isFreeleech(str []string) bool {
+	for _, s := range str {
+		if freeleechRegex.MatchString(s) {
+			return true
+		}
+	}
+	return false
+}
+
+// itemCustomElement is the decode target for an item's custom <torrent>
+// element, which some feeds (e.g. the Avistaz network) use for size and hash.
+type itemCustomElement struct {
+	ContentLength int64  `xml:"contentLength"`
+	InfoHash      string `xml:"infoHash"`
+}