Files
crowsnest/cmd/crawler/collectors/zeit.go

40 lines
967 B
Go

package collectors
import (
	"fmt"
	"log"
	"time"

	"github.com/gocolly/colly/v2"
)
// Zeit crawls the issue archive of zeit.de and stores every scraped page in
// c.Responses, keyed by the page URL with the response body as a string.
//
// Starting at 1 January 1946 and advancing in 7-day steps until now, the ISO
// year and week of each date are turned into an archive index URL of the form
// https://www.zeit.de/YYYY/WW/index, which is then visited. Links found on
// fetched pages are followed as well, restricted to the zeit.de domains and a
// maximum crawl depth of 2. Responses are cached on disk under
// ./persistence/zeit_cache.
//
// Zeit has no return value: archive index pages that cannot be visited are
// logged and skipped so that one failing issue does not abort the whole crawl.
func (c *Collector) Zeit() {
	collycollector := colly.NewCollector(
		colly.AllowedDomains("www.zeit.de", "zeit.de"),
		colly.CacheDir("./persistence/zeit_cache"),
		colly.MaxDepth(2),
	)

	// Cascade: follow every link found on a fetched page.
	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		// The error is discarded on purpose: colly rejects links that were
		// already visited, leave the allowed domains or exceed the depth
		// limit, so failures are routine here and not worth reporting.
		_ = e.Request.Visit(e.Attr("href"))
	})

	// Persist: once a page has been fully scraped, store its body under its URL.
	// NOTE(review): any result of Insert is not inspected here — confirm that
	// it cannot fail silently.
	collycollector.OnScraped(func(r *colly.Response) {
		c.Responses.Insert(r.Request.URL.String(), string(r.Body))
	})

	// Walk the archive one week at a time. Stepping by exactly 7 days lands in
	// each consecutive ISO week once, including across year boundaries.
	startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
	currentDate := time.Now()
	for date := startDate; !date.After(currentDate); date = date.AddDate(0, 0, 7) {
		year, week := date.ISOWeek()
		url := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week)
		if err := collycollector.Visit(url); err != nil {
			// Report and continue; a missing or failing issue must not stop
			// the remaining weeks from being crawled.
			log.Printf("zeit: visiting %s: %v", url, err)
		}
	}
}