package collectors

import (
	"fmt"
	"log"
	"time"

	"github.com/gocolly/colly/v2"
)

// Zeit crawls the archive of zeit.de and stores every scraped response in
// c.Responses, keyed by the URL of the request that produced it.
//
// One archive index page is requested per ISO week, starting at 1946-01-01
// and advancing in 7-day steps up to the current time. Links found on fetched
// pages are followed as well; the collector is configured with the zeit.de
// domains as its allowed domains, a maximum depth of 2, and an on-disk cache
// under ./persistence/zeit_cache.
//
// An index page that cannot be visited is logged and skipped, so a single
// failing week does not abort the remainder of the crawl.
func (c *Collector) Zeit() {
	crawler := colly.NewCollector(
		colly.AllowedDomains("www.zeit.de", "zeit.de"),
		colly.CacheDir("./persistence/zeit_cache"),
		colly.MaxDepth(2),
	)

	// Cascade: queue every link found on a fetched page.
	crawler.OnHTML("a[href]", func(e *colly.HTMLElement) {
		// The error is discarded on purpose: for followed links a rejection
		// (e.g. a link outside the allowed domains, beyond the depth limit,
		// or already visited) is the normal case rather than a failure worth
		// reporting.
		_ = e.Request.Visit(e.Attr("href"))
	})

	// Store the body of every scraped page under its request URL.
	crawler.OnScraped(func(r *colly.Response) {
		c.Responses.Insert(r.Request.URL.String(), string(r.Body))
	})

	// Walk the archive week by week. Stepping by exactly 7 days moves to the
	// following ISO week on every iteration, so each (year, week) pair is
	// requested once.
	start := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
	now := time.Now()
	for date := start; !date.After(now); date = date.AddDate(0, 0, 7) {
		year, week := date.ISOWeek()
		indexURL := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week)
		if err := crawler.Visit(indexURL); err != nil {
			// Report the failure but keep going with the next week.
			log.Printf("zeit: visiting %s: %v", indexURL, err)
		}
	}
}