diff --git a/src/cmd/frontend/main.go b/src/cmd/frontend/main.go index 6de08b3..cd7a50f 100644 --- a/src/cmd/frontend/main.go +++ b/src/cmd/frontend/main.go @@ -67,7 +67,7 @@ func main() { // define app webapp := app.NewApp(db) - // create middle + // create middleware stack := middleware.CreateStack( middleware.Logging, ) diff --git a/src/internal/crawler/crawlerfacade.go b/src/internal/crawler/crawlerfacade.go index 17585a7..6dfba7d 100644 --- a/src/internal/crawler/crawlerfacade.go +++ b/src/internal/crawler/crawlerfacade.go @@ -8,62 +8,66 @@ import ( ) type CrawlerFacade struct { - spiegelFeedDistributer *util.Distributer[*model.Article] - zeitFeedDistributer *util.Distributer[*model.Article] + spiegelFeedDistributer *util.Distributer[*model.Article] + zeitFeedDistributer *util.Distributer[*model.Article] } func (cf *CrawlerFacade) Init() { - // init - cf.spiegelFeedDistributer = &util.Distributer[*model.Article]{} - cf.spiegelFeedDistributer.Init() - cf.zeitFeedDistributer = &util.Distributer[*model.Article]{} - cf.zeitFeedDistributer.Init() - - // run spiegel feed - sf := &WebFeed{} - sf.Init( - "https://www.spiegel.de/", + // init + cf.spiegelFeedDistributer = &util.Distributer[*model.Article]{} + cf.spiegelFeedDistributer.Init() + cf.zeitFeedDistributer = &util.Distributer[*model.Article]{} + cf.zeitFeedDistributer.Init() + + // run spiegel feed + sf := &WebFeed{} + sf.Init( + "https://www.spiegel.de/", colly.AllowedDomains("www.spiegel.de", "spiegel.de"), colly.CacheDir("./persistence/spiegel_cache"), colly.MaxDepth(1), - ) - sf_feed := sf.Feed() - sf_converter := ConverterSpiegel{} - sf_converter.Init() + ) + sf_feed := sf.Feed() + sf_converter := SpiegelConverter{} + sf_converter.Init() - go func() { - for val := range sf_feed { - article, err := sf_converter.Convert(val) - if err != nil { continue } - cf.spiegelFeedDistributer.Publish(article) - } - }() + go func() { + for val := range sf_feed { + article, err := sf_converter.Convert(val) + if err != nil { + continue + } + cf.spiegelFeedDistributer.Publish(article) + } + }() - // run zeit feed - zf := &WebFeed{} - zf.Init( - "https://www.zeit.de/index", + // run zeit feed + zf := &WebFeed{} + zf.Init( + "https://www.zeit.de/index", colly.AllowedDomains("www.zeit.de", "zeit.de"), colly.CacheDir("./persistence/zeit_cache"), colly.MaxDepth(1), - ) - zf_feed := zf.Feed() - zf_converter := ZeitConverter{} - zf_converter.Init() + ) + zf_feed := zf.Feed() + zf_converter := ZeitConverter{} + zf_converter.Init() - go func() { - for val := range zf_feed { - article, err := zf_converter.Convert(val) - if err != nil { continue } - cf.zeitFeedDistributer.Publish(article) - } - }() + go func() { + for val := range zf_feed { + article, err := zf_converter.Convert(val) + if err != nil { + continue + } + cf.zeitFeedDistributer.Publish(article) + } + }() } func (cf *CrawlerFacade) SubscribeToSpiegelFeed(hook func(*model.Article)) { - cf.spiegelFeedDistributer.Subscribe(hook) + cf.spiegelFeedDistributer.Subscribe(hook) } func (cf *CrawlerFacade) SubscribeToZeitFeed(hook func(*model.Article)) { - cf.zeitFeedDistributer.Subscribe(hook) + cf.zeitFeedDistributer.Subscribe(hook) } diff --git a/src/internal/crawler/spiegelconverter.go b/src/internal/crawler/spiegelconverter.go index 0d8240b..99c0c85 100644 --- a/src/internal/crawler/spiegelconverter.go +++ b/src/internal/crawler/spiegelconverter.go @@ -10,19 +10,19 @@ import ( "github.com/PuerkitoBio/goquery" ) -type ConverterSpiegel struct { +type SpiegelConverter struct { pattern_paywall *regexp.Regexp pattern_url *regexp.Regexp pattern_whitespace *regexp.Regexp } -func (c *ConverterSpiegel) Init() { +func (c *SpiegelConverter) Init() { c.pattern_paywall = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`) c.pattern_url = regexp.MustCompile(`^https://(www\.)?spiegel.de.*`) c.pattern_whitespace = regexp.MustCompile(`\s+`) } -func (c *ConverterSpiegel) Convert(res *Resource) (*model.Article, error) { +func (c *SpiegelConverter) Convert(res *Resource) (*model.Article, error) { // check url url pattern if !c.pattern_url.Match([]byte(res.Url)) { return nil, errors.New("invalid url pattern")