cmd/crawler/collectors/spiegel.go

package collectors

import (
	//"crowsnest/internal/model"
	//"regexp"
	//"time"
	//"strings"

	"fmt"
	"time"

	"github.com/gocolly/colly/v2"
)


// Spiegel crawls the spiegel.de news archive one day at a time, starting at
// 2000-01-01 (UTC) and ending at the current time.
//
// Every link found on a fetched page is followed as well, restricted to the
// spiegel.de domains and to a maximum depth of 3. Once a page has been fully
// scraped, its URL and body are handed to c.Responses.Insert. Errors returned
// by colly's Visit calls are not inspected.
func (c *Collector) Spiegel() {
	crawler := colly.NewCollector(
		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
		colly.CacheDir("./persistence/spiegel_cache"),
		colly.MaxDepth(3),
	)

	// Cascade: queue the target of every anchor on the page that was just parsed.
	followLink := func(anchor *colly.HTMLElement) {
		anchor.Request.Visit(anchor.Attr("href"))
	}
	crawler.OnHTML("a[href]", followLink)

	// Cache: record the page under its own URL after all HTML callbacks have run.
	storeResponse := func(resp *colly.Response) {
		pageURL := resp.Request.URL.String()
		c.Responses.Insert(pageURL, string(resp.Body))
	}
	crawler.OnScraped(storeResponse)

	// Walk the archive: one overview page per calendar day, addressed by a
	// DD.MM.YYYY date embedded in the URL.
	const archiveURLFormat = "https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html"

	firstDay := time.Date(2000, time.January, 1, 0, 0, 0, 0, time.UTC)
	now := time.Now()

	day := firstDay
	for !day.After(now) {
		archiveURL := fmt.Sprintf(archiveURLFormat, day.Format("02.01.2006"))
		crawler.Visit(archiveURL)

		day = day.AddDate(0, 0, 1)
	}

	// Disabled article extraction, kept for reference. It dereferences a
	// `results` map and needs the model, regexp and strings imports, which are
	// commented out at the top of this file.
	// NOTE(review): if this is ever re-enabled, the callbacks presumably have to
	// be registered before the archive loop above, since the collector is not
	// created with colly.Async and Visit would already have returned — confirm.

	//// create entry if not behind paywall
	//paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
	//collycollector.OnResponse(func(r *colly.Response) {
	//	if paywall_false_pattern.Match(r.Body) {
	//		url := r.Request.URL.String()
	//		(*results)[url] = &model.Article{
	//			SourceUrl: url,
	//			FetchDate: time.Now(),
	//			Content:   "",
	//		}
	//	}
	//})

	//// check for article type
	//collycollector.OnHTML("meta[property='og:type']", func(e *colly.HTMLElement) {
	//	if e.Attr("content") != "article" {
	//		delete(*results, e.Request.URL.String())
	//	}
	//})

	//// add title
	//collycollector.OnHTML("meta[property='og:title']", func(e *colly.HTMLElement) {
	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
	//		val.Title = e.Attr("content")
	//	}
	//})

	//// prepend description to content of article
	//collycollector.OnHTML("meta[name='description']", func(e *colly.HTMLElement) {
	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
	//		val.Content = e.Attr("content") + val.Content
	//	}
	//})

	//// add publishing date
	//collycollector.OnHTML("meta[name='date']", func(e *colly.HTMLElement) {
	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
	//		t, err := time.Parse("2006-01-02T15:04:05-07:00", e.Attr("content"))
	//		if err != nil {
	//			panic(err)
	//		}
	//		val.PublishDate = t
	//	}
	//})

	//// add author
	//collycollector.OnHTML("meta[name='author']", func(e *colly.HTMLElement) {
	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
	//		val.Author = e.Attr("content")
	//	}
	//})

	//// add content
	//collycollector.OnHTML("main[id='Inhalt'] div > p", func(e *colly.HTMLElement) {
	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
	//		cont := val.Content
	//
	//		pattern := regexp.MustCompile("\\s+")
	//		cont = string(pattern.ReplaceAll([]byte(cont), []byte(" ")))
	//		cont = strings.ReplaceAll(cont, "»", "\"")
	//		cont = strings.ReplaceAll(cont, "«", "\"")
	//		val.Content = cont + " " + e.Text
	//	}
	//})
}
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`package collectors`

			`import (`
			`//"crowsnest/internal/model"`
			`//"regexp"`
			`//"time"`
			`//"strings"`

			`"fmt"`
			`"time"`

			`"github.com/gocolly/colly/v2"`
			`)`


			`func (c *Collector) Spiegel() {`
			`collycollector := colly.NewCollector(`
			`colly.AllowedDomains("www.spiegel.de", "spiegel.de"),`
			`colly.CacheDir("./persistence/spiegel_cache"),`
			`colly.MaxDepth(3),`
			`)`

			`// cascade`
			`collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {`
			`e.Request.Visit(e.Attr("href"))`
			`})`

			`// cache`
			`collycollector.OnScraped(func(r *colly.Response) {`
			`c.Responses.Insert(r.Request.URL.String(), string(r.Body))`
			`})`

			`// go through archive`
			`startDate := time.Date(2000, time.January, 1, 0, 0, 0, 0, time.UTC)`
			`currentDate := time.Now()`

			`for date := startDate; date.Before(currentDate) \|\| date.Equal(currentDate); date = date.AddDate(0, 0, 1) {`
			`urlDate := date.Format("02.01.2006")`
			`url := fmt.Sprintf("https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html", urlDate)`

			`collycollector.Visit(url)`
			`}`

			`//// create entry if not behind paywall`
			`//paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")`
			`//collycollector.OnResponse(func(r *colly.Response) {`
			`// if paywall_false_pattern.Match(r.Body) {`
			`// url := r.Request.URL.String()`
			`// (*results)[url] = &model.Article{`
			`// SourceUrl: url,`
			`// FetchDate: time.Now(),`
			`// Content: "",`
			`// }`
			`// }`

			`//})`

			`//// check for article type`
			`//collycollector.OnHTML("meta[property='og:type']", func(e *colly.HTMLElement) {`
			`// if e.Attr("content") != "article" {`
			`// delete(*results, e.Request.URL.String())`
			`// }`
			`//})`

			`//// add title`
			`//collycollector.OnHTML("meta[property='og:title']", func(e *colly.HTMLElement) {`
			`// if val, ok := (*results)[e.Request.URL.String()]; ok {`
			`// val.Title = e.Attr("content")`
			`// }`
			`//})`

			`//// prepend description to content of article`
			`//collycollector.OnHTML("meta[name='description']", func(e *colly.HTMLElement) {`
			`// if val, ok := (*results)[e.Request.URL.String()]; ok {`
			`// val.Content = e.Attr("content") + val.Content`
			`// }`
			`//})`

			`//// add publishing date`
			`//collycollector.OnHTML("meta[name='date']", func(e *colly.HTMLElement) {`
			`// if val, ok := (*results)[e.Request.URL.String()]; ok {`
			`// t, err := time.Parse("2006-01-02T15:04:05-07:00", e.Attr("content"))`
			`// if err != nil {`
			`// panic(err)`
			`// }`
			`// val.PublishDate = t`
			`// }`
			`//})`

			`//// add author`
			`//collycollector.OnHTML("meta[name='author']", func(e *colly.HTMLElement) {`
			`// if val, ok := (*results)[e.Request.URL.String()]; ok {`
			`// val.Author = e.Attr("content")`
			`// }`
			`//})`

			`//// add content`
			`//collycollector.OnHTML("main[id='Inhalt'] div > p", func(e *colly.HTMLElement) {`
			`// if val, ok := (*results)[e.Request.URL.String()]; ok {`
			`// cont := val.Content`

			`// pattern := regexp.MustCompile("\\s+")`
			`// cont = string(pattern.ReplaceAll([]byte(cont), []byte(" ")))`
			`// cont = strings.ReplaceAll(cont, "»", "\"")`
			`// cont = strings.ReplaceAll(cont, "«", "\"")`
			`// val.Content = cont + " " + e.Text`
			`// }`
			`//})`

			`}`