rename spiegel converter

This commit is contained in:
2025-03-24 13:34:37 +01:00
parent 9687f327fe
commit b7b4e74eff
3 changed files with 48 additions and 44 deletions

View File

@@ -67,7 +67,7 @@ func main() {
// define app // define app
webapp := app.NewApp(db) webapp := app.NewApp(db)
// create middle // create middleware
stack := middleware.CreateStack( stack := middleware.CreateStack(
middleware.Logging, middleware.Logging,
) )

View File

@@ -8,62 +8,66 @@ import (
) )
type CrawlerFacade struct { type CrawlerFacade struct {
spiegelFeedDistributer *util.Distributer[*model.Article] spiegelFeedDistributer *util.Distributer[*model.Article]
zeitFeedDistributer *util.Distributer[*model.Article] zeitFeedDistributer *util.Distributer[*model.Article]
} }
func (cf *CrawlerFacade) Init() { func (cf *CrawlerFacade) Init() {
// init // init
cf.spiegelFeedDistributer = &util.Distributer[*model.Article]{} cf.spiegelFeedDistributer = &util.Distributer[*model.Article]{}
cf.spiegelFeedDistributer.Init() cf.spiegelFeedDistributer.Init()
cf.zeitFeedDistributer = &util.Distributer[*model.Article]{} cf.zeitFeedDistributer = &util.Distributer[*model.Article]{}
cf.zeitFeedDistributer.Init() cf.zeitFeedDistributer.Init()
// run spiegel feed // run spiegel feed
sf := &WebFeed{} sf := &WebFeed{}
sf.Init( sf.Init(
"https://www.spiegel.de/", "https://www.spiegel.de/",
colly.AllowedDomains("www.spiegel.de", "spiegel.de"), colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
colly.CacheDir("./persistence/spiegel_cache"), colly.CacheDir("./persistence/spiegel_cache"),
colly.MaxDepth(1), colly.MaxDepth(1),
) )
sf_feed := sf.Feed() sf_feed := sf.Feed()
sf_converter := ConverterSpiegel{} sf_converter := SpiegelConverter{}
sf_converter.Init() sf_converter.Init()
go func() { go func() {
for val := range sf_feed { for val := range sf_feed {
article, err := sf_converter.Convert(val) article, err := sf_converter.Convert(val)
if err != nil { continue } if err != nil {
cf.spiegelFeedDistributer.Publish(article) continue
} }
}() cf.spiegelFeedDistributer.Publish(article)
}
}()
// run zeit feed // run zeit feed
zf := &WebFeed{} zf := &WebFeed{}
zf.Init( zf.Init(
"https://www.zeit.de/index", "https://www.zeit.de/index",
colly.AllowedDomains("www.zeit.de", "zeit.de"), colly.AllowedDomains("www.zeit.de", "zeit.de"),
colly.CacheDir("./persistence/zeit_cache"), colly.CacheDir("./persistence/zeit_cache"),
colly.MaxDepth(1), colly.MaxDepth(1),
) )
zf_feed := zf.Feed() zf_feed := zf.Feed()
zf_converter := ZeitConverter{} zf_converter := ZeitConverter{}
zf_converter.Init() zf_converter.Init()
go func() { go func() {
for val := range zf_feed { for val := range zf_feed {
article, err := zf_converter.Convert(val) article, err := zf_converter.Convert(val)
if err != nil { continue } if err != nil {
cf.zeitFeedDistributer.Publish(article) continue
} }
}() cf.zeitFeedDistributer.Publish(article)
}
}()
} }
func (cf *CrawlerFacade) SubscribeToSpiegelFeed(hook func(*model.Article)) { func (cf *CrawlerFacade) SubscribeToSpiegelFeed(hook func(*model.Article)) {
cf.spiegelFeedDistributer.Subscribe(hook) cf.spiegelFeedDistributer.Subscribe(hook)
} }
func (cf *CrawlerFacade) SubscribeToZeitFeed(hook func(*model.Article)) { func (cf *CrawlerFacade) SubscribeToZeitFeed(hook func(*model.Article)) {
cf.zeitFeedDistributer.Subscribe(hook) cf.zeitFeedDistributer.Subscribe(hook)
} }

View File

@@ -10,19 +10,19 @@ import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
) )
type ConverterSpiegel struct { type SpiegelConverter struct {
pattern_paywall *regexp.Regexp pattern_paywall *regexp.Regexp
pattern_url *regexp.Regexp pattern_url *regexp.Regexp
pattern_whitespace *regexp.Regexp pattern_whitespace *regexp.Regexp
} }
func (c *ConverterSpiegel) Init() { func (c *SpiegelConverter) Init() {
c.pattern_paywall = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`) c.pattern_paywall = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
c.pattern_url = regexp.MustCompile(`^https://(www\.)?spiegel.de.*`) c.pattern_url = regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
c.pattern_whitespace = regexp.MustCompile(`\s+`) c.pattern_whitespace = regexp.MustCompile(`\s+`)
} }
func (c *ConverterSpiegel) Convert(res *Resource) (*model.Article, error) { func (c *SpiegelConverter) Convert(res *Resource) (*model.Article, error) {
// check url url pattern // check url url pattern
if !c.pattern_url.Match([]byte(res.Url)) { if !c.pattern_url.Match([]byte(res.Url)) {
return nil, errors.New("invalid url pattern") return nil, errors.New("invalid url pattern")