crowsnest/cmd/crawler/collectors/zeit.go

package collectors

import (
	"fmt"
	"time"

	"github.com/gocolly/colly/v2"
)

// Gets every page of the archive of zeit.de and stores the responses into the
// database.
func (c *Collector) Zeit() {
	collycollector := colly.NewCollector(
		colly.AllowedDomains("www.zeit.de", "zeit.de"),
		colly.CacheDir("./persistence/zeit_cache"),
		colly.MaxDepth(2),
	)

	// cascade
	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		e.Request.Visit(e.Attr("href"))
	})

	// cache
	collycollector.OnScraped(func(r *colly.Response) {
		c.Responses.Insert(r.Request.URL.String(), string(r.Body))
	})

	// go through archive
	startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
	currentDate := time.Now()

	for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 7) {
		year, week := date.ISOWeek()
		url := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week)

		collycollector.Visit(url)
	}
}