Add zeit.de collector

This commit is contained in:
2025-01-07 11:41:17 +01:00
parent fb257666aa
commit b16ebb9572
2 changed files with 120 additions and 12 deletions

View File

@@ -13,17 +13,17 @@ import (
"github.com/gocolly/colly/v2"
)
func (c *Collector) SpiegelCollect() {
func (c *Collector) CollectSpiegel() {
collycollector := colly.NewCollector(
colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
colly.CacheDir("./persistence/spiegel_cache"),
colly.MaxDepth(5),
)
// cache
// store articles
collycollector.OnResponse(func(r *colly.Response) {
url := r.Request.URL.String()
err := c.SpiegelExtract(url, r.Body)
err := c.ExtractSpiegel(url, r.Body)
if err == nil {
log.Println("added article", url)
} else {
@@ -53,7 +53,7 @@ func (c *Collector) SpiegelCollect() {
}
}
func (c *Collector) SpiegelExtract(url string, body []byte) error {
func (c *Collector) ExtractSpiegel(url string, body []byte) error {
paywall_pattern := regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
url_pattern := regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
whitespace := regexp.MustCompile(`\s+`)