From b16ebb95729976fad13c18c16c0a74ccf41f13ec Mon Sep 17 00:00:00 2001
From: Elias Kohout
Date: Tue, 7 Jan 2025 11:41:17 +0100
Subject: [PATCH] add zeit.de collector

---
 cmd/crawler/collectors/spiegel.go |   8 +-
 cmd/crawler/collectors/zeit.go    | 124 ++++++++++++++++++++++++++++--
 2 files changed, 120 insertions(+), 12 deletions(-)

diff --git a/cmd/crawler/collectors/spiegel.go b/cmd/crawler/collectors/spiegel.go
index 3f1dcbe..08dbd9b 100644
--- a/cmd/crawler/collectors/spiegel.go
+++ b/cmd/crawler/collectors/spiegel.go
@@ -13,17 +13,17 @@ import (
 	"github.com/gocolly/colly/v2"
 )
 
-func (c *Collector) SpiegelCollect() {
+func (c *Collector) CollectSpiegel() {
 	collycollector := colly.NewCollector(
 		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
 		colly.CacheDir("./persistence/spiegel_cache"),
 		colly.MaxDepth(5),
 	)
 
-	// cache
+	// store articles
 	collycollector.OnResponse(func(r *colly.Response) {
 		url := r.Request.URL.String()
-		err := c.SpiegelExtract(url, r.Body)
+		err := c.ExtractSpiegel(url, r.Body)
 		if err == nil {
 			log.Println("added article", url)
 		} else {
@@ -53,7 +53,7 @@ func (c *Collector) SpiegelCollect() {
 	}
 }
 
-func (c *Collector) SpiegelExtract(url string, body []byte) error {
+func (c *Collector) ExtractSpiegel(url string, body []byte) error {
 	paywall_pattern := regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
 	url_pattern := regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
 	whitespace := regexp.MustCompile(`\s+`)
diff --git a/cmd/crawler/collectors/zeit.go b/cmd/crawler/collectors/zeit.go
index 621d9dd..606b988 100644
--- a/cmd/crawler/collectors/zeit.go
+++ b/cmd/crawler/collectors/zeit.go
@@ -1,33 +1,51 @@
 package collectors
 
 import (
+	"crowsnest/internal/model"
+	"errors"
 	"fmt"
+	"log"
+	"regexp"
+	"strings"
 	"time"
 
+	"github.com/PuerkitoBio/goquery"
 	"github.com/gocolly/colly/v2"
 )
 
 // Gets every page of the archive of zeit.de and stores the responses into the
 // database.
-func (c *Collector) Zeit() {
+func (c *Collector) CollectZeit() {
 	collycollector := colly.NewCollector(
 		colly.AllowedDomains("www.zeit.de", "zeit.de"),
 		colly.CacheDir("./persistence/zeit_cache"),
-		colly.MaxDepth(2),
+		colly.MaxDepth(5),
 	)
 
+	// store articles
+	collycollector.OnResponse(func(r *colly.Response) {
+		url := r.Request.URL.String()
+		err := c.ExtractZeit(url, r.Body)
+		if err == nil {
+			log.Println("added article", url)
+		} else {
+			log.Println("failed to add article:", err, "("+url+")")
+		}
+	})
+
 	// cascade
 	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
-		e.Request.Visit(e.Attr("href"))
-	})
+		url := e.Attr("href")
 
-	// cache
-	collycollector.OnScraped(func(r *colly.Response) {
-		c.Responses.Insert(r.Request.URL.String(), r.Body)
+		if !strings.HasPrefix(url, "http") {
+			return
+		}
+		e.Request.Visit(url)
 	})
 
 	// go through archive
-	startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
+	startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)
+	//startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
 	currentDate := time.Now()
 
 	for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 7) {
@@ -37,3 +55,93 @@ func (c *Collector) Zeit() {
 		collycollector.Visit(url)
 	}
 }
+
+func (c *Collector) ExtractZeit(url string, body []byte) error {
+	url_pattern := regexp.MustCompile(`^https://(www\.)?zeit\.de.*`)
+	whitespace := regexp.MustCompile(`\s+`)
+
+	var exists bool
+	var pagetype, title, content, datestr string
+	var tag *goquery.Selection
+	var date time.Time
+
+	// check url pattern
+	if !url_pattern.MatchString(url) {
+		return errors.New("invalid url pattern")
+	}
+
+	// construct goquery doc
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
+	if err != nil {
+		return err
+	}
+
+	// check for article type
+	tag = doc.Find("meta[property='og:type']")
+	pagetype, exists = tag.Attr("content")
+	if !exists || pagetype != "article" {
+		return errors.New("unable to extract article, not of type article")
+	}
+
+	// check for paywall
+	tag = doc.Find("meta[property='article:content_tier']")
+	pagetype, exists = tag.Attr("content")
+	if !exists || pagetype != "free" {
+		return errors.New("unable to extract article due to paywall")
+	}
+
+	// get title
+	tag = doc.Find("meta[property='og:title']")
+	title, exists = tag.Attr("content")
+	if !exists {
+		return errors.New("unable to extract article, no title tag")
+	}
+
+	// prepend description to content of article
+	tag = doc.Find("meta[name='description']")
+	content, exists = tag.Attr("content")
+	if !exists {
+		return errors.New("unable to extract article, no description tag")
+	}
+	content += " "
+
+	if strings.Contains(content, "Das Liveblog") {
+		return errors.New("unable to extract article, no support for liveblog")
+	}
+
+	// get publishing date
+	tag = doc.Find("meta[name='date']")
+	datestr, exists = tag.Attr("content")
+	if !exists {
+		return errors.New("unable to extract article, no date tag")
+	}
+
+	date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
+	if err != nil {
+		return err
+	}
+
+	// get content
+	tag = doc.Find("main > article > div.article-body p.article__item")
+
+	tag.Each(func(index int, p *goquery.Selection) {
+		content += " " + p.Text()
+	})
+
+	// clean up content string
+	content = whitespace.ReplaceAllString(content, " ")
+	content = strings.ReplaceAll(content, "»", "\"")
+	content = strings.ReplaceAll(content, "«", "\"")
+
+	// insert new article
+	article := model.Article{
+		SourceUrl:   url,
+		PublishDate: date,
+		FetchDate:   time.Now(),
+		Title:       title,
+		Content:     content,
+	}
+
+	err = c.Articles.Insert(&article)
+	return err
+}
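
Reviewer note (not part of the patch above): ExtractZeit is driven entirely by a
handful of CSS selectors plus a small clean-up pass, which makes it easy to pin
down with a unit test against an inline fixture. The sketch below is an
assumption-laden illustration, not part of the commit: the fixture markup is
invented to match the meta tags the patch queries (og:type,
article:content_tier, og:title, description, date), and it deliberately tests
only the selector and normalization logic, since the diff does not show how
c.Articles is constructed.

package collectors

import (
	"regexp"
	"strings"
	"testing"

	"github.com/PuerkitoBio/goquery"
)

// fixture is a hypothetical, minimal stand-in for a free zeit.de article page.
const fixture = `<!DOCTYPE html>
<html><head>
<meta property="og:type" content="article">
<meta property="article:content_tier" content="free">
<meta property="og:title" content="Example headline">
<meta name="description" content="Example teaser.">
<meta name="date" content="2025-01-07T11:41:17+01:00">
</head><body>
<main><article><div class="article-body">
<p class="article__item">Er sagte: »Hallo,   Welt.«</p>
<p class="article__item">Second   paragraph.</p>
</div></article></main></body></html>`

func TestZeitSelectors(t *testing.T) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(fixture))
	if err != nil {
		t.Fatal(err)
	}

	// the same meta lookups ExtractZeit performs
	if v, ok := doc.Find("meta[property='og:type']").Attr("content"); !ok || v != "article" {
		t.Fatalf("og:type = %q, want article", v)
	}
	if v, ok := doc.Find("meta[property='article:content_tier']").Attr("content"); !ok || v != "free" {
		t.Fatalf("content_tier = %q, want free", v)
	}

	// the same paragraph selector and clean-up pass as ExtractZeit
	var content string
	doc.Find("main > article > div.article-body p.article__item").Each(func(_ int, p *goquery.Selection) {
		content += " " + p.Text()
	})
	content = regexp.MustCompile(`\s+`).ReplaceAllString(content, " ")
	content = strings.ReplaceAll(content, "»", "\"")
	content = strings.ReplaceAll(content, "«", "\"")

	want := ` Er sagte: "Hallo, Welt." Second paragraph.`
	if content != want {
		t.Fatalf("content = %q, want %q", content, want)
	}
}

If the file is placed next to zeit.go (e.g. cmd/crawler/collectors/zeit_test.go,
a path assumed from the diff), go test ./cmd/crawler/collectors runs it, and the
fixture gives one obvious place to update when zeit.de changes its markup.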