move collection and extraction of articles into their own file; add custom response cache
This commit is contained in:
39
cmd/crawler/collectors/zeit.go
Normal file
39
cmd/crawler/collectors/zeit.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/gocolly/colly/v2"
|
||||
)
|
||||
|
||||
// Gets every page of the archive of zeit.de and stores the responses into the
|
||||
// database.
|
||||
func (c *Collector) Zeit() {
|
||||
collycollector := colly.NewCollector(
|
||||
colly.AllowedDomains("www.zeit.de", "zeit.de"),
|
||||
colly.CacheDir("./persistence/zeit_cache"),
|
||||
colly.MaxDepth(2),
|
||||
)
|
||||
|
||||
// cascade
|
||||
collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
e.Request.Visit(e.Attr("href"))
|
||||
})
|
||||
|
||||
// cache
|
||||
collycollector.OnScraped(func(r *colly.Response) {
|
||||
c.Responses.Insert(r.Request.URL.String(), string(r.Body))
|
||||
})
|
||||
|
||||
// go through archive
|
||||
startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
|
||||
currentDate := time.Now()
|
||||
|
||||
for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 7) {
|
||||
year, week := date.ISOWeek()
|
||||
url := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week)
|
||||
|
||||
collycollector.Visit(url)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user