40 lines
967 B
Go
40 lines
967 B
Go
package collectors
|
|
|
|
import (
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/gocolly/colly/v2"
|
|
)
|
|
|
|
// Gets every page of the archive of zeit.de and stores the responses into the
|
|
// database.
|
|
func (c *Collector) Zeit() {
|
|
collycollector := colly.NewCollector(
|
|
colly.AllowedDomains("www.zeit.de", "zeit.de"),
|
|
colly.CacheDir("./persistence/zeit_cache"),
|
|
colly.MaxDepth(2),
|
|
)
|
|
|
|
// cascade
|
|
collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
|
e.Request.Visit(e.Attr("href"))
|
|
})
|
|
|
|
// cache
|
|
collycollector.OnScraped(func(r *colly.Response) {
|
|
c.Responses.Insert(r.Request.URL.String(), string(r.Body))
|
|
})
|
|
|
|
// go through archive
|
|
startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
|
|
currentDate := time.Now()
|
|
|
|
for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 7) {
|
|
year, week := date.ISOWeek()
|
|
url := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week)
|
|
|
|
collycollector.Visit(url)
|
|
}
|
|
}
|