package collectors

import (
	"bytes"
	"errors"
	"fmt"
	"log"
	"regexp"
	"strings"
	"time"

	"crowsnest/internal/model"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly/v2"
)

// Package-level patterns: compiled once instead of on every SpiegelExtract call.
var (
	// Matches the embedded JSON flag Spiegel sets on paywalled articles.
	spiegelPaywallPattern = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
	// Accepts only https URLs on spiegel.de. The "." before "de" is escaped
	// so that e.g. "spiegelXde" is rejected (the original pattern left it as
	// a wildcard).
	spiegelURLPattern = regexp.MustCompile(`^https://(www\.)?spiegel\.de.*`)
	// Collapses any run of whitespace into a single space.
	spiegelWhitespace = regexp.MustCompile(`\s+`)
)

// SpiegelCollect crawls the Spiegel news archive day by day, starting at
// 2025-01-01 up to and including today, and hands every fetched page to
// SpiegelExtract. Responses are cached on disk so re-runs skip pages that
// were already downloaded.
func (c *Collector) SpiegelCollect() {
	collycollector := colly.NewCollector(
		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
		colly.CacheDir("./persistence/spiegel_cache"),
		colly.MaxDepth(5),
	)

	// Every response goes through the extractor; extraction failures are
	// logged but do not abort the crawl.
	collycollector.OnResponse(func(r *colly.Response) {
		url := r.Request.URL.String()
		if err := c.SpiegelExtract(url, r.Body); err != nil {
			log.Println("failed to add article:", err, "("+url+")")
		} else {
			log.Println("added article", url)
		}
	})

	// Cascade: follow absolute links found on each page. Domain and depth
	// restrictions are enforced by the collector's configuration above.
	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		url := e.Attr("href")
		if !strings.HasPrefix(url, "http") {
			return
		}
		// Visit errors here (already-visited, filtered domain, …) are
		// expected noise during cascading and are deliberately ignored.
		_ = e.Request.Visit(url)
	})

	// Walk the daily archive index pages.
	startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)
	currentDate := time.Now()
	for date := startDate; !date.After(currentDate); date = date.AddDate(0, 0, 1) {
		urlDate := date.Format("02.01.2006")
		url := fmt.Sprintf("https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html", urlDate)
		if err := collycollector.Visit(url); err != nil {
			// An unreachable archive page should not stop the whole crawl.
			log.Println("failed to visit archive page:", err, "("+url+")")
		}
	}
}

// SpiegelExtract parses a fetched Spiegel page and inserts it into the
// article store. It returns an error when the URL is not a spiegel.de page,
// the article is behind the paywall, the page is not of type "article", or
// any of the expected meta tags (title, description, date, author) are
// missing or malformed.
func (c *Collector) SpiegelExtract(url string, body []byte) error {
	// Only accept spiegel.de URLs.
	if !spiegelURLPattern.MatchString(url) {
		return errors.New("invalid url pattern")
	}

	// Paywalled articles carry this flag in their embedded JSON; their
	// visible text is truncated, so they are skipped entirely.
	if spiegelPaywallPattern.Match(body) {
		return errors.New("unable to extract article due to paywall")
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return err
	}

	// Only pages declared as articles are extracted (skips index pages etc.).
	pagetype, exists := doc.Find("meta[property='og:type']").Attr("content")
	if !exists || pagetype != "article" {
		return errors.New("unable to extract article, not of type article")
	}

	title, exists := doc.Find("meta[property='og:title']").Attr("content")
	if !exists {
		return errors.New("unable to extract article, no title tag")
	}

	// The meta description is prepended to the article body text below.
	description, exists := doc.Find("meta[name='description']").Attr("content")
	if !exists {
		return errors.New("unable to extract article, no description tag")
	}

	datestr, exists := doc.Find("meta[name='date']").Attr("content")
	if !exists {
		return errors.New("unable to extract article, no date tag")
	}
	// Expected format is RFC 3339 with numeric offset,
	// e.g. "2025-01-02T12:34:56+01:00".
	date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
	if err != nil {
		return err
	}

	author, exists := doc.Find("meta[name='author']").Attr("content")
	if !exists {
		return errors.New("unable to extract article, no author tag")
	}

	// Collect paragraph text; strings.Builder avoids the quadratic growth
	// of repeated string concatenation.
	var sb strings.Builder
	sb.WriteString(description)
	sb.WriteString(" ")
	doc.Find("main[id='Inhalt'] div > p").Each(func(_ int, p *goquery.Selection) {
		sb.WriteString(" ")
		sb.WriteString(p.Text())
	})

	// Normalize whitespace and replace German guillemets with plain quotes.
	content := spiegelWhitespace.ReplaceAllString(sb.String(), " ")
	content = strings.ReplaceAll(content, "»", "\"")
	content = strings.ReplaceAll(content, "«", "\"")

	article := model.Article{
		SourceUrl:   url,
		PublishDate: date,
		FetchDate:   time.Now(),
		Title:       title,
		Content:     content,
		Author:      author,
	}
	return c.Articles.Insert(&article)
}