cmd/crawler/collectors/spiegel.go

package collectors

import (
	"crowsnest/internal/model"
	"errors"
	"fmt"
	"log"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly/v2"
)

// SpiegelCollect crawls spiegel.de starting from the daily news archive pages
// and passes every response body to SpiegelExtract.
//
// The colly collector is restricted to the www.spiegel.de / spiegel.de
// domains, limited to a depth of 5 and configured with the cache directory
// ./persistence/spiegel_cache. One archive page per day is requested, from
// 2025-01-01 up to and including the current time; absolute links found on
// visited pages are followed as well.
//
// The method returns nothing: the outcome of every extraction and every
// failed archive request is reported through the standard logger.
func (c *Collector) SpiegelCollect() {
	collycollector := colly.NewCollector(
		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
		colly.CacheDir("./persistence/spiegel_cache"),
		colly.MaxDepth(5),
	)

	// extract: every response is handed to SpiegelExtract, which decides
	// whether the page is an article and stores it
	collycollector.OnResponse(func(r *colly.Response) {
		url := r.Request.URL.String()
		if err := c.SpiegelExtract(url, r.Body); err != nil {
			log.Println("failed to add article:", err, "("+url+")")
			return
		}
		log.Println("added article", url)
	})

	// cascade: follow absolute links found on the page
	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		url := e.Attr("href")

		// relative links and non-http schemes are skipped
		if !strings.HasPrefix(url, "http") {
			return
		}

		// Best effort: the error is deliberately ignored. Following links is
		// opportunistic and the collector is expected to refuse many of them
		// (see the colly documentation for Request.Visit), so logging each
		// refusal would only add noise.
		_ = e.Request.Visit(url)
	})

	// go through archive, one page per day
	startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)
	currentDate := time.Now()

	for date := startDate; !date.After(currentDate); date = date.AddDate(0, 0, 1) {
		// the archive URL carries the date as DD.MM.YYYY
		url := fmt.Sprintf("https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html", date.Format("02.01.2006"))

		// A failed archive request would otherwise drop a whole day without
		// any trace, so report it.
		if err := collycollector.Visit(url); err != nil {
			log.Println("failed to visit archive page:", err, "("+url+")")
		}
	}
}

// Patterns used by SpiegelExtract, compiled once at package initialisation
// instead of on every call.
var (
	// spiegelPaywallPattern matches the fragment of the page source that
	// flags the paywall as active.
	spiegelPaywallPattern = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)

	// spiegelURLPattern accepts https URLs whose host is spiegel.de or
	// www.spiegel.de. The dot is escaped and the host must end directly after
	// ".de", so look-alike hosts such as "spiegelxde.example" or
	// "spiegel.dev" are rejected.
	spiegelURLPattern = regexp.MustCompile(`^https://(www\.)?spiegel\.de([:/?#]|$)`)

	// spiegelWhitespace matches any run of whitespace.
	spiegelWhitespace = regexp.MustCompile(`\s+`)
)

// SpiegelExtract parses the HTML in body, which was fetched from url, and
// stores it as a model.Article through c.Articles.Insert.
//
// A page is only stored when url is a spiegel.de https URL, no active paywall
// marker is found in body, the og:type meta tag equals "article", and the
// og:title, description, date and author meta tags are all present. The
// article content is the meta description followed by the text of every
// paragraph matched by "main[id='Inhalt'] div > p", with whitespace runs
// collapsed to single spaces, surrounding whitespace trimmed and the
// guillemets » and « replaced by double quotes.
//
// It returns nil on success and an error describing the first failed check,
// a parse failure or the failed insert otherwise.
func (c *Collector) SpiegelExtract(url string, body []byte) error {
	// check url pattern
	if !spiegelURLPattern.MatchString(url) {
		return errors.New("invalid url pattern")
	}

	// check for paywall
	if spiegelPaywallPattern.Match(body) {
		return errors.New("unable to extract article due to paywall")
	}

	// construct goquery doc
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
	if err != nil {
		return fmt.Errorf("parsing html: %w", err)
	}

	// check for article type
	pagetype, exists := doc.Find("meta[property='og:type']").Attr("content")
	if !exists || pagetype != "article" {
		return errors.New("unable to extract article, not of type article")
	}

	// get title
	title, exists := doc.Find("meta[property='og:title']").Attr("content")
	if !exists {
		return errors.New("unable to extract article, no title tag")
	}

	// get description, which is prepended to the content of the article
	description, exists := doc.Find("meta[name='description']").Attr("content")
	if !exists {
		return errors.New("unable to extract article, no description tag")
	}

	// get publishing date
	datestr, exists := doc.Find("meta[name='date']").Attr("content")
	if !exists {
		return errors.New("unable to extract article, no date tag")
	}

	// RFC 3339 accepts numeric offsets such as +01:00 as well as a "Z" suffix.
	date, err := time.Parse(time.RFC3339, datestr)
	if err != nil {
		return fmt.Errorf("parsing publish date %q: %w", datestr, err)
	}

	// get author
	author, exists := doc.Find("meta[name='author']").Attr("content")
	if !exists {
		return errors.New("unable to extract article, no author tag")
	}

	// get content: description first, then every paragraph, space separated
	var text strings.Builder
	text.WriteString(description)
	doc.Find("main[id='Inhalt'] div > p").Each(func(_ int, p *goquery.Selection) {
		text.WriteByte(' ')
		text.WriteString(p.Text())
	})

	// clean up content string
	content := spiegelWhitespace.ReplaceAllString(text.String(), " ")
	content = strings.TrimSpace(content)
	content = strings.NewReplacer("»", "\"", "«", "\"").Replace(content)

	// insert new article
	article := model.Article{
		SourceUrl:   url,
		PublishDate: date,
		FetchDate:   time.Now(),
		Title:       title,
		Content:     content,
		Author:      author,
	}

	if err := c.Articles.Insert(&article); err != nil {
		return fmt.Errorf("inserting article: %w", err)
	}
	return nil
}
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`package collectors`

			`import (`
change db from sqlite3 to postgresql 2025-01-07 09:32:57 +01:00			`"crowsnest/internal/model"`
remove caching in database for performance reasons 2025-01-07 10:14:16 +01:00			`"errors"`
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`"fmt"`
change db from sqlite3 to postgresql 2025-01-07 09:32:57 +01:00			`"log"`
remove caching in database for performance reasons 2025-01-07 10:14:16 +01:00			`"regexp"`
change db from sqlite3 to postgresql 2025-01-07 09:32:57 +01:00			`"strings"`
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`"time"`

remove caching in database for performance reasons 2025-01-07 10:14:16 +01:00			`"github.com/PuerkitoBio/goquery"`
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`"github.com/gocolly/colly/v2"`
			`)`

remove caching in database for performance reasons 2025-01-07 10:14:16 +01:00			`func (c *Collector) SpiegelCollect() {`
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`collycollector := colly.NewCollector(`
			`colly.AllowedDomains("www.spiegel.de", "spiegel.de"),`
			`colly.CacheDir("./persistence/spiegel_cache"),`
remove caching in database for performance reasons 2025-01-07 10:14:16 +01:00			`colly.MaxDepth(5),`
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`)`

change db from sqlite3 to postgresql 2025-01-07 09:32:57 +01:00			`// cache`
			`collycollector.OnResponse(func(r *colly.Response) {`
			`url := r.Request.URL.String()`
remove caching in database for performance reasons 2025-01-07 10:14:16 +01:00			`err := c.SpiegelExtract(url, r.Body)`
			`if err == nil {`
			`log.Println("added article", url)`
			`} else {`
			`log.Println("failed to add article:", err, "("+url+")")`
			`}`
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`})`

change db from sqlite3 to postgresql 2025-01-07 09:32:57 +01:00			`// cascade`
			`collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {`
			`url := e.Attr("href")`

			`if !strings.HasPrefix(url, "http") {`
			`return`
			`}`
			`e.Request.Visit(url)`
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`})`

change db from sqlite3 to postgresql 2025-01-07 09:32:57 +01:00			`// go through archive`
			`startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)`
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`currentDate := time.Now()`

			`for date := startDate; date.Before(currentDate) \|\| date.Equal(currentDate); date = date.AddDate(0, 0, 1) {`
			`urlDate := date.Format("02.01.2006")`
			`url := fmt.Sprintf("https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html", urlDate)`

change db from sqlite3 to postgresql 2025-01-07 09:32:57 +01:00			`collycollector.Visit(url)`
			`}`
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`}`
remove caching in database for performance reasons 2025-01-07 10:14:16 +01:00
			`func (c *Collector) SpiegelExtract(url string, body []byte) error {`
			paywall_pattern := regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
			url_pattern := regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
			whitespace := regexp.MustCompile(`\s+`)

			`var exists bool`
			`var pagetype, title, content, datestr, author string`
			`var tag *goquery.Selection`
			`var date time.Time`

			`// check url url pattern`
			`if !url_pattern.Match([]byte(url)) {`
			`return errors.New("invalid url pattern")`
			`}`

			`// check for paywall`
			`if paywall_pattern.Match(body) {`
			`return errors.New("unable to extract article due to paywal")`
			`}`

			`// construct goquery doc`
			`doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))`
			`if err != nil {`
			`return err`
			`}`

			`// check for article type`
			`tag = doc.Find("meta[property='og:type']")`
			`pagetype, exists = tag.Attr("content")`
			`if !exists \|\| pagetype != "article" {`
			`return errors.New("unable to extract article, not of type article")`
			`}`

			`// get title`
			`tag = doc.Find("meta[property='og:title']")`
			`title, exists = tag.Attr("content")`
			`if !exists {`
			`return errors.New("unable to extract article, no title tag")`
			`}`

			`// prepend description to content of article`
			`tag = doc.Find("meta[name='description']")`
			`content, exists = tag.Attr("content")`
			`content += " "`
			`if !exists {`
			`return errors.New("unable to extract article, no description tag")`
			`}`

			`// get publishing date`
			`tag = doc.Find("meta[name='date']")`
			`datestr, exists = tag.Attr("content")`
			`if !exists {`
			`return errors.New("unable to extract article, no date tag")`
			`}`

			`date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)`
			`if err != nil {`
			`return err`
			`}`

			`// get author`
			`tag = doc.Find("meta[name='author']")`
			`author, exists = tag.Attr("content")`
			`if !exists {`
			`return errors.New("unable to extract article, no author tag")`
			`}`

			`// get content`
			`tag = doc.Find("main[id='Inhalt'] div > p")`

			`tag.Each(func(index int, p *goquery.Selection) {`
			`content += " " + p.Text()`
			`})`

			`// clean up content string`
			`content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))`
			`content = strings.ReplaceAll(content, "»", "\"")`
			`content = strings.ReplaceAll(content, "«", "\"")`

			`// insert new article`
			`article := model.Article{`
			`SourceUrl: url,`
			`PublishDate: date,`
			`FetchDate: time.Now(),`
			`Title: title,`
			`Content: content,`
			`Author: author,`
			`}`

			`err = c.Articles.Insert(&article)`
			`return err`
			`}`