diff --git a/src/cmd/frontend/main.go b/src/cmd/frontend/main.go
index e0a79fa..b4bb595 100644
--- a/src/cmd/frontend/main.go
+++ b/src/cmd/frontend/main.go
@@ -4,10 +4,10 @@ import (
     "crowsnest/internal/app"
     "crowsnest/internal/crawler"
     "crowsnest/internal/middleware"
+    "crowsnest/internal/model"
     "crowsnest/internal/model/database"
     "log"
     "net/http"
-    "time"
 
     _ "github.com/lib/pq"
 )
@@ -19,16 +19,16 @@ func main() {
     }
 
     // run web crawlers
-    coll := crawler.Crawler{
-        Articles: &database.ArticleModel{DB: db},
-    }
-    go func() {
-        for {
-            coll.ZeitCollectIndex()
-            coll.SpiegelCollectIndex()
-            time.Sleep(5 * time.Minute)
-        }
-    }()
+    articles := &database.ArticleModel{DB: db}
+    facade := crawler.CrawlerFacade{}
+    facade.Init()
+
+    facade.SubscribeToSpiegelFeed(func(a *model.Article) {
+        articles.Insert(a)
+    })
+    facade.SubscribeToZeitFeed(func(a *model.Article) {
+        articles.Insert(a)
+    })
 
     // define app
     webapp := app.NewApp(db)
diff --git a/src/internal/crawler/CrawlerFacade.go b/src/internal/crawler/CrawlerFacade.go
new file mode 100644
index 0000000..17585a7
--- /dev/null
+++ b/src/internal/crawler/CrawlerFacade.go
@@ -0,0 +1,69 @@
+package crawler
+
+import (
+    "crowsnest/internal/model"
+    "crowsnest/internal/util"
+
+    "github.com/gocolly/colly/v2"
+)
+
+type CrawlerFacade struct {
+    spiegelFeedDistributer *util.Distributer[*model.Article]
+    zeitFeedDistributer    *util.Distributer[*model.Article]
+}
+
+func (cf *CrawlerFacade) Init() {
+    // init feed distributers
+    cf.spiegelFeedDistributer = &util.Distributer[*model.Article]{}
+    cf.spiegelFeedDistributer.Init()
+    cf.zeitFeedDistributer = &util.Distributer[*model.Article]{}
+    cf.zeitFeedDistributer.Init()
+
+    // run spiegel feed
+    sf := &WebFeed{}
+    sf.Init(
+        "https://www.spiegel.de/",
+        colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
+        colly.CacheDir("./persistence/spiegel_cache"),
+        colly.MaxDepth(1),
+    )
+    sf_feed := sf.Feed()
+    sf_converter := ConverterSpiegel{}
+    sf_converter.Init()
+
+    go func() {
+        for val := range sf_feed {
+            article, err := sf_converter.Convert(val)
+            if err != nil { continue }
+            cf.spiegelFeedDistributer.Publish(article)
+        }
+    }()
+
+    // run zeit feed
+    zf := &WebFeed{}
+    zf.Init(
+        "https://www.zeit.de/index",
+        colly.AllowedDomains("www.zeit.de", "zeit.de"),
+        colly.CacheDir("./persistence/zeit_cache"),
+        colly.MaxDepth(1),
+    )
+    zf_feed := zf.Feed()
+    zf_converter := ZeitConverter{}
+    zf_converter.Init()
+
+    go func() {
+        for val := range zf_feed {
+            article, err := zf_converter.Convert(val)
+            if err != nil { continue }
+            cf.zeitFeedDistributer.Publish(article)
+        }
+    }()
+}
+
+func (cf *CrawlerFacade) SubscribeToSpiegelFeed(hook func(*model.Article)) {
+    cf.spiegelFeedDistributer.Subscribe(hook)
+}
+
+func (cf *CrawlerFacade) SubscribeToZeitFeed(hook func(*model.Article)) {
+    cf.zeitFeedDistributer.Subscribe(hook)
+}
diff --git a/src/internal/crawler/Resource.go b/src/internal/crawler/Resource.go
new file mode 100644
index 0000000..b2a4239
--- /dev/null
+++ b/src/internal/crawler/Resource.go
@@ -0,0 +1,6 @@
+package crawler
+
+type Resource struct {
+    Url  string
+    Body string
+}
diff --git a/src/internal/crawler/SpiegelConverter.go b/src/internal/crawler/SpiegelConverter.go
new file mode 100644
index 0000000..0d8240b
--- /dev/null
+++ b/src/internal/crawler/SpiegelConverter.go
@@ -0,0 +1,96 @@
+package crawler
+
+import (
+    "crowsnest/internal/model"
+    "errors"
+    "regexp"
+    "strings"
+    "time"
+
+    "github.com/PuerkitoBio/goquery"
+)
+
+type ConverterSpiegel struct {
+    pattern_paywall    *regexp.Regexp
+    pattern_url        *regexp.Regexp
+    pattern_whitespace *regexp.Regexp
+}
+
+func (c *ConverterSpiegel) Init() {
+    c.pattern_paywall = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
+    c.pattern_url = regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
+    c.pattern_whitespace = regexp.MustCompile(`\s+`)
+}
+
+func (c *ConverterSpiegel) Convert(res *Resource) (*model.Article, error) {
+    // check url pattern
+    if !c.pattern_url.Match([]byte(res.Url)) {
+        return nil, errors.New("invalid url pattern")
+    }
+
+    // check for paywall
+    if c.pattern_paywall.Match([]byte(res.Body)) {
+        return nil, errors.New("unable to extract article due to paywall")
+    }
+
+    // construct goquery doc
+    doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Body))
+    if err != nil {
+        return nil, err
+    }
+
+    // check for article type
+    tag := doc.Find("meta[property='og:type']")
+    pagetype, exists := tag.Attr("content")
+    if !exists || pagetype != "article" {
+        return nil, errors.New("unable to extract article, not of type article")
+    }
+
+    // get title
+    tag = doc.Find("meta[property='og:title']")
+    title, exists := tag.Attr("content")
+    if !exists {
+        return nil, errors.New("unable to extract article, no title tag")
+    }
+
+    // prepend description to content of article
+    tag = doc.Find("meta[name='description']")
+    content, exists := tag.Attr("content")
+    content += " "
+    if !exists {
+        return nil, errors.New("unable to extract article, no description tag")
+    }
+
+    // get publishing date
+    tag = doc.Find("meta[name='date']")
+    datestr, exists := tag.Attr("content")
+    if !exists {
+        return nil, errors.New("unable to extract article, no date tag")
+    }
+
+    date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
+    if err != nil {
+        return nil, err
+    }
+
+    // get content
+    tag = doc.Find("main[id='Inhalt'] div > p")
+
+    tag.Each(func(index int, p *goquery.Selection) {
+        content += " " + p.Text()
+    })
+
+    // clean up content string
+    content = string(c.pattern_whitespace.ReplaceAll([]byte(content), []byte(" ")))
+    content = strings.ReplaceAll(content, "»", "\"")
+    content = strings.ReplaceAll(content, "«", "\"")
+
+    // create new article
+    return &model.Article{
+        SourceUrl:   res.Url,
+        PublishDate: date,
+        FetchDate:   time.Now(),
+        Title:       title,
+        Content:     content,
+    }, nil
+}
diff --git a/src/internal/crawler/WebFeed.go b/src/internal/crawler/WebFeed.go
new file mode 100644
index 0000000..7363d3e
--- /dev/null
+++ b/src/internal/crawler/WebFeed.go
@@ -0,0 +1,67 @@
+package crawler
+
+import (
+    "crowsnest/internal/util"
+    "log"
+    "strings"
+    "time"
+
+    "github.com/gocolly/colly/v2"
+)
+
+type WebFeed struct {
+    feed      chan *Resource
+    collector *colly.Collector
+}
+
+// Init initializes the WebFeed and starts the process of collecting Resources.
+func (sf *WebFeed) Init(indexUrl string, options ...colly.CollectorOption) {
+    // create feed
+    sf.feed = make(chan *Resource, 100)
+
+    // set cache, domain pattern and max recursion depth
+    sf.collector = colly.NewCollector(options...)
+
+    // emit fetched pages into the feed as Resources
+    sf.collector.OnResponse(func(r *colly.Response) {
+        url := r.Request.URL.String()
+        body := string(r.Body)
+        sf.feed <- &Resource{Url: url, Body: body}
+    })
+
+    // cascade
+    sf.collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
+        url := e.Attr("href")
+        if !strings.HasPrefix(url, "http") {
+            return
+        }
+        e.Request.Visit(url)
+    })
+
+    // start runner
+    go sf.runner(indexUrl)
+}
+
+// Feed returns the channel into which the collected Resources will be written.
+func (sf *WebFeed) Feed() <-chan *Resource {
+    return sf.feed
+}
+
+func (sf *WebFeed) runner(indexUrl string) {
+    for {
+        // sleep for 5 minutes between collection runs
+        time.Sleep(5 * time.Minute)
+
+        // collect index
+        urls, err := util.GetAllURLs(indexUrl)
+        if err != nil {
+            log.Println("error in WebFeed runner: ", err.Error())
+            continue
+        }
+
+        // visit urls
+        for _, url := range urls {
+            sf.collector.Visit(url)
+        }
+    }
+}
diff --git a/src/internal/crawler/ZeitConverter.go b/src/internal/crawler/ZeitConverter.go
new file mode 100644
index 0000000..3fe7845
--- /dev/null
+++ b/src/internal/crawler/ZeitConverter.go
@@ -0,0 +1,100 @@
+package crawler
+
+import (
+    "crowsnest/internal/model"
+    "errors"
+    "regexp"
+    "strings"
+    "time"
+
+    "github.com/PuerkitoBio/goquery"
+)
+
+type ZeitConverter struct {
+    pattern_url        *regexp.Regexp
+    pattern_whitespace *regexp.Regexp
+}
+
+func (c *ZeitConverter) Init() {
+    c.pattern_url = regexp.MustCompile(`^https://(www\.)?zeit\.de[^#]*$`)
+    c.pattern_whitespace = regexp.MustCompile(`\s+`)
+}
+
+func (c *ZeitConverter) Convert(res *Resource) (*model.Article, error) {
+    // check url pattern
+    if !c.pattern_url.Match([]byte(res.Url)) {
+        return nil, errors.New("invalid url pattern")
+    }
+
+    // construct goquery doc
+    doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Body))
+    if err != nil {
+        return nil, err
+    }
+
+    // check for article type
+    tag := doc.Find("meta[property='og:type']")
+    pagetype, exists := tag.Attr("content")
+    if !exists || pagetype != "article" {
+        return nil, errors.New("unable to extract article, not of type article")
+    }
+
+    // check for paywall
+    tag = doc.Find("meta[property='article:content_tier']")
+    pagetype, exists = tag.Attr("content")
+    if !exists || pagetype != "free" {
+        return nil, errors.New("unable to extract article due to paywall")
+    }
+
+    // get title
+    tag = doc.Find("meta[property='og:title']")
+    title, exists := tag.Attr("content")
+    if !exists {
+        return nil, errors.New("unable to extract article, no title tag")
+    }
+
+    // prepend description to content of article
+    tag = doc.Find("meta[name='description']")
+    content, exists := tag.Attr("content")
+    content += " "
+    if !exists {
+        return nil, errors.New("unable to extract article, no description tag")
+    }
+
+    if strings.Contains(content, "Das Liveblog") {
+        return nil, errors.New("unable to extract article, no support for liveblog")
+    }
+
+    // get publishing date
+    tag = doc.Find("meta[name='date']")
+    datestr, exists := tag.Attr("content")
+    if !exists {
+        return nil, errors.New("unable to extract article, no date tag")
+    }
+
+    date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
+    if err != nil {
+        return nil, err
+    }
+
+    // get content
+    tag = doc.Find("main > article > div.article-body p.article__item")
+
+    tag.Each(func(index int, p *goquery.Selection) {
+        content += " " + p.Text()
+    })
+
+    // clean up content string
+    content = string(c.pattern_whitespace.ReplaceAll([]byte(content), []byte(" ")))
+    content = strings.ReplaceAll(content, "»", "\"")
+    content = strings.ReplaceAll(content, "«", "\"")
+
+    // create new article
+    return &model.Article{
+        SourceUrl:   res.Url,
+        PublishDate: date,
+        FetchDate:   time.Now(),
+        Title:       title,
+        Content:     content,
+    }, nil
+}
diff --git a/src/internal/crawler/spiegel.go b/src/internal/crawler/spiegel.go
deleted file mode 100644
index 2ae5a1c..0000000
--- a/src/internal/crawler/spiegel.go
+++ /dev/null
@@ -1,152 +0,0 @@
-package crawler
-
-import (
-    "crowsnest/internal/model"
-    "errors"
-    "fmt"
-    "regexp"
-    "strings"
-    "time"
-
-    "github.com/PuerkitoBio/goquery"
-    "github.com/gocolly/colly/v2"
-)
-
-func (c *Crawler) SpiegelCollector() *colly.Collector {
-    // set cache, domain pattern and max recursion deepth
-    collector := colly.NewCollector(
-        colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
-        colly.CacheDir("./persistence/spiegel_cache"),
-        colly.MaxDepth(5),
-    )
-    // store articles
-    collector.OnResponse(func(r *colly.Response) {
-        url := r.Request.URL.String()
-        c.SpiegelExtract(url, r.Body)
-    })
-    // cascade
-    collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
-        url := e.Attr("href")
-        if !strings.HasPrefix(url, "http") {
-            return
-        }
-        e.Request.Visit(url)
-    })
-
-    return collector
-}
-
-func (c *Crawler) SpiegelCollectIndex() error {
-    urls, err := c.GetAllURLs("https://www.spiegel.de/")
-    if err != nil {
-        return err
-    }
-
-    collector := c.SpiegelCollector()
-    collector.MaxDepth = 1
-
-    for _, url := range urls {
-        collector.Visit(url)
-    }
-    return nil
-}
-
-func (c *Crawler) SpiegelCollectArchive() {
-    collector := c.SpiegelCollector()
-
-    // go through archive
-    startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
-    currentDate := time.Now()
-
-    for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 1) {
-        urlDate := date.Format("02.01.2006")
-        url := fmt.Sprintf("https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html", urlDate)
-
-        collector.Visit(url)
-    }
-}
-
-func (c *Crawler) SpiegelExtract(url string, body []byte) error {
-    paywall_pattern := regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
-    url_pattern := regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
-    whitespace := regexp.MustCompile(`\s+`)
-
-    var exists bool
-    var pagetype, title, content, datestr string
-    var tag *goquery.Selection
-    var date time.Time
-
-    // check url url pattern
-    if !url_pattern.Match([]byte(url)) {
-        return errors.New("invalid url pattern")
-    }
-
-    // check for paywall
-    if paywall_pattern.Match(body) {
-        return errors.New("unable to extract article due to paywal")
-    }
-
-    // construct goquery doc
-    doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
-    if err != nil {
-        return err
-    }
-
-    // check for article type
-    tag = doc.Find("meta[property='og:type']")
-    pagetype, exists = tag.Attr("content")
-    if !exists || pagetype != "article" {
-        return errors.New("unable to extract article, not of type article")
-    }
-
-    // get title
-    tag = doc.Find("meta[property='og:title']")
-    title, exists = tag.Attr("content")
-    if !exists {
-        return errors.New("unable to extract article, no title tag")
-    }
-
-    // prepend description to content of article
-    tag = doc.Find("meta[name='description']")
-    content, exists = tag.Attr("content")
-    content += " "
-    if !exists {
-        return errors.New("unable to extract article, no description tag")
-    }
-
-    // get publishing date
-    tag = doc.Find("meta[name='date']")
-    datestr, exists = tag.Attr("content")
-    if !exists {
-        return errors.New("unable to extract article, no date tag")
-    }
-
-    date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
-    if err != nil {
-        return err
-    }
-
-    // get content
-    tag = doc.Find("main[id='Inhalt'] div > p")
-
-    tag.Each(func(index int, p *goquery.Selection) {
-        content += " " + p.Text()
-    })
-
-    // clean up content string
-    content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
-    content = strings.ReplaceAll(content, "»", "\"")
-    content = strings.ReplaceAll(content, "«", "\"")
-
-    // insert new article
-    article := model.Article{
-        SourceUrl:   url,
-        PublishDate: date,
-        FetchDate:   time.Now(),
-        Title:       title,
-        Content:     content,
-    }
-
-    err = c.Articles.Insert(&article)
-    return err
-}
diff --git a/src/internal/crawler/zeit.go b/src/internal/crawler/zeit.go
deleted file mode 100644
index 90d4e5e..0000000
--- a/src/internal/crawler/zeit.go
+++ /dev/null
@@ -1,160 +0,0 @@
-package crawler
-
-import (
-    "crowsnest/internal/model"
-    "errors"
-    "fmt"
-    "regexp"
-    "strings"
-    "time"
-
-    "github.com/PuerkitoBio/goquery"
-    "github.com/gocolly/colly/v2"
-)
-
-func (c *Crawler) ZeitCollector() *colly.Collector {
-    // set cache, domain pattern and max recursion deepth
-    collector := colly.NewCollector(
-        colly.AllowedDomains("www.zeit.de", "zeit.de"),
-        colly.CacheDir("./persistence/zeit_cache"),
-        colly.MaxDepth(5),
-    )
-    // store articles
-    collector.OnResponse(func(r *colly.Response) {
-        url := r.Request.URL.String()
-        c.ZeitExtract(url, r.Body)
-    })
-    // cascade
-    collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
-        url := e.Attr("href")
-        if !strings.HasPrefix(url, "http") {
-            return
-        }
-        e.Request.Visit(url)
-    })
-
-    return collector
-}
-
-func (c *Crawler) ZeitCollectIndex() error {
-    urls, err := c.GetAllURLs("https://www.zeit.de/index")
-    if err != nil {
-        return err
-    }
-
-    collector := c.ZeitCollector()
-    collector.MaxDepth = 1
-
-    for _, url := range urls {
-        collector.Visit(url)
-    }
-    return nil
-}
-
-// Gets every page of the archive of zeit.de and stores the responses into the
-// database.
-func (c *Crawler) ZeitCollectArchive() {
-    collector := c.ZeitCollector()
-
-    // go through archive
-    startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
-    //startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
-    currentDate := time.Now()
-
-    for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 7) {
-        year, week := date.ISOWeek()
-        url := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week)
-
-        collector.Visit(url)
-    }
-}
-
-func (c *Crawler) ZeitExtract(url string, body []byte) error {
-    url_pattern := regexp.MustCompile(`^https://(www\.)?zeit\.de[^#]*$`)
-    whitespace := regexp.MustCompile(`\s+`)
-
-    var exists bool
-    var pagetype, title, content, datestr string
-    var tag *goquery.Selection
-    var date time.Time
-
-    // check url url pattern
-    if !url_pattern.Match([]byte(url)) {
-        return errors.New("invalid url pattern")
-    }
-
-    // construct goquery doc
-    doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
-    if err != nil {
-        return err
-    }
-
-    // check for article type
-    tag = doc.Find("meta[property='og:type']")
-    pagetype, exists = tag.Attr("content")
-    if !exists || pagetype != "article" {
-        return errors.New("unable to extract article, not of type article")
-    }
-
-    // check for paywall
-    tag = doc.Find("meta[property='article:content_tier']")
-    pagetype, exists = tag.Attr("content")
-    if !exists || pagetype != "free" {
-        return errors.New("unable to extract article due to paywal")
-    }
-
-    // get title
-    tag = doc.Find("meta[property='og:title']")
-    title, exists = tag.Attr("content")
-    if !exists {
-        return errors.New("unable to extract article, no title tag")
-    }
-
-    // prepend description to content of article
-    tag = doc.Find("meta[name='description']")
-    content, exists = tag.Attr("content")
-    content += " "
-    if !exists {
-        return errors.New("unable to extract article, no description tag")
-    }
-
-    if strings.Contains(content, "Das Liveblog") {
-        return errors.New("unable to extract article, no support for liveblog")
-    }
-
-    // get publishing date
-    tag = doc.Find("meta[name='date']")
-    datestr, exists = tag.Attr("content")
-    if !exists {
-        return errors.New("unable to extract article, no date tag")
-    }
-
-    date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
-    if err != nil {
-        return err
-    }
-
-    // get content
-    tag = doc.Find("main > article > div.article-body p.article__item")
-
-    tag.Each(func(index int, p *goquery.Selection) {
-        content += " " + p.Text()
-    })
-
-    // clean up content string
-    content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
-    content = strings.ReplaceAll(content, "»", "\"")
-    content = strings.ReplaceAll(content, "«", "\"")
-
-    // insert new article
-    article := model.Article{
-        SourceUrl:   url,
-        PublishDate: date,
-        FetchDate:   time.Now(),
-        Title:       title,
-        Content:     content,
-    }
-
-    err = c.Articles.Insert(&article)
-    return err
-}
diff --git a/src/internal/model/article.go b/src/internal/model/article.go
index 8bd9232..4f24834 100644
--- a/src/internal/model/article.go
+++ b/src/internal/model/article.go
@@ -16,6 +16,18 @@ type Article struct {
     AiSummary string
 }
 
+func (a *Article) Clone() *Article {
+    return &Article{
+        Id:          a.Id,
+        SourceUrl:   a.SourceUrl,
+        PublishDate: a.PublishDate,
+        FetchDate:   a.FetchDate,
+        Title:       a.Title,
+        Content:     a.Content,
+        AiSummary:   a.AiSummary,
+    }
+}
+
 // TODO docstring
 type ArticleViewModel struct {
     Id int
diff --git a/src/internal/util/Converter.go b/src/internal/util/Converter.go
new file mode 100644
index 0000000..bd2ce01
--- /dev/null
+++ b/src/internal/util/Converter.go
@@ -0,0 +1,3 @@
+package util
+
+type Converter[I any, O any] func(I) (O, error)
diff --git a/src/internal/util/Distributer.go b/src/internal/util/Distributer.go
new file mode 100644
index 0000000..46f5a01
--- /dev/null
+++ b/src/internal/util/Distributer.go
@@ -0,0 +1,37 @@
+package util
+
+type Distributer[T IClone[T]] struct {
+    queue chan T
+    hooks []func(T)
+}
+
+func (d *Distributer[T]) Init() {
+    d.queue = make(chan T, 100)
+    d.hooks = make([]func(T), 0)
+}
+
+// Distribute a copy of an item to every hook that has subscribed to this
+// Distributer.
+func (d *Distributer[T]) Publish(item T) {
+    d.queue <- item
+}
+
+// Add a new hook to the Distributer. The hook will be called async whenever a
+// new item is published.
+func (d *Distributer[T]) Subscribe(hook func(T)) {
+    d.hooks = append(d.hooks, hook)
+    if len(d.hooks) == 1 {
+        go d.runner()
+    }
+}
+
+// Will be started to run async when Subscribe is first called. Whenever
+// Publish is called the runner will distribute a clone of the new item to
+// every hook.
+func (d *Distributer[T]) runner() {
+    for val := range d.queue {
+        for _, f := range d.hooks {
+            go f(val.Clone())
+        }
+    }
+}
diff --git a/src/internal/util/IClone.go b/src/internal/util/IClone.go
new file mode 100644
index 0000000..e31801c
--- /dev/null
+++ b/src/internal/util/IClone.go
@@ -0,0 +1,5 @@
+package util
+
+type IClone[T any] interface {
+    Clone() T
+}
diff --git a/src/internal/crawler/crawler.go b/src/internal/util/web.go
similarity index 84%
rename from src/internal/crawler/crawler.go
rename to src/internal/util/web.go
index 80fee97..dd4a392 100644
--- a/src/internal/crawler/crawler.go
+++ b/src/internal/util/web.go
@@ -1,19 +1,14 @@
-package crawler
+package util
 
 import (
-    "crowsnest/internal/model/database"
     "fmt"
     "net/http"
 
     "github.com/PuerkitoBio/goquery"
 )
 
-type Crawler struct {
-    Articles *database.ArticleModel
-}
-
 // GetAllURLs fetches all URLs from a given web page URL
-func (c *Crawler) GetAllURLs(pageURL string) ([]string, error) {
+func GetAllURLs(pageURL string) ([]string, error) {
     // Send a GET request to the provided URL
     resp, err := http.Get(pageURL)
     if err != nil {
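
Reviewer note (illustration only, not part of the diff): the new util.Distributer[T] fans a published item out to every subscribed hook, handing each hook its own clone, and the first Subscribe call starts the background runner. The sketch below shows that contract with a hypothetical hit type; the type, the main function, and the assumption that the file lives inside the crowsnest module (so the internal import resolves) are illustrative, not taken from this change set.

package main

import (
    "fmt"
    "time"

    "crowsnest/internal/util"
)

// hit is a hypothetical payload type used only for this sketch; its pointer
// type satisfies util.IClone[*hit], which Distributer requires.
type hit struct {
    Url string
}

// Clone returns an independent copy, so concurrent hooks never share state.
func (h *hit) Clone() *hit {
    return &hit{Url: h.Url}
}

func main() {
    d := &util.Distributer[*hit]{}
    d.Init()

    // The first Subscribe starts the runner goroutine; each Publish is then
    // fanned out to all hooks, every hook receiving its own clone.
    d.Subscribe(func(h *hit) { fmt.Println("hook A:", h.Url) })
    d.Subscribe(func(h *hit) { fmt.Println("hook B:", h.Url) })

    d.Publish(&hit{Url: "https://www.spiegel.de/"})

    // Hooks run in their own goroutines; give them a moment before exiting.
    time.Sleep(100 * time.Millisecond)
}

This mirrors what main.go now does with *model.Article, where both subscriptions simply insert the cloned article into the database.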