move collection and extraction of articles into their own file; add custom response cache

2025-01-04 16:53:46 +01:00
parent f31c1d7793
commit d63ee8dcf2
14 changed files with 519 additions and 105 deletions
--- a/cmd/crawler/collectors/zeit.go
+++ b/cmd/crawler/collectors/zeit.go
@@ -0,0 +1,39 @@
+package collectors
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/gocolly/colly/v2"
+)
+
+// Zeit walks the weekly archive index pages of zeit.de, one ISO week at a time
+// starting in January 1946, and passes every scraped response to c.Responses.
+func (c *Collector) Zeit() {
+	crawler := colly.NewCollector(
+		colly.AllowedDomains("www.zeit.de", "zeit.de"),
+		colly.CacheDir("./persistence/zeit_cache"),
+		colly.MaxDepth(2),
+	)
+
+	// Follow every link on a fetched page; the returned error is discarded.
+	crawler.OnHTML("a[href]", func(el *colly.HTMLElement) {
+		_ = el.Request.Visit(el.Attr("href"))
+	})
+
+	// Hand the URL and body of each scraped page to the response store.
+	crawler.OnScraped(func(resp *colly.Response) {
+		c.Responses.Insert(resp.Request.URL.String(), string(resp.Body))
+	})
+
+	// Step through the archive in seven-day increments up to the present.
+	now := time.Now()
+	first := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
+
+	for day := first; !day.After(now); day = day.AddDate(0, 0, 7) {
+		isoYear, isoWeek := day.ISOWeek()
+		indexURL := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", isoYear, isoWeek)
+		// Visit's error is discarded, so a failed week does not stop the walk.
+		_ = crawler.Visit(indexURL)
+	}
+}