package collectors

import (
	//"crowsnest/internal/model"
	//"regexp"
	//"time"
	//"strings"
	"fmt"
	"time"

	"github.com/gocolly/colly/v2"
)

// Spiegel crawls spiegel.de and stores every scraped page in c.Responses,
// keyed by the page URL.
//
// The crawl is seeded with one news-archive page per calendar day, starting at
// 2000-01-01 (UTC) and running up to the current time. From each page every
// link is followed, restricted to the spiegel.de domains and to a depth of 3.
// Fetched responses are cached by colly under ./persistence/spiegel_cache.
func (c *Collector) Spiegel() {
	const (
		archiveDateLayout = "02.01.2006"
		archiveURLFormat  = "https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html"
	)

	crawler := colly.NewCollector(
		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
		colly.CacheDir("./persistence/spiegel_cache"),
		colly.MaxDepth(3),
	)

	// Cascade: follow every link found on a page.
	crawler.OnHTML("a[href]", func(link *colly.HTMLElement) {
		target := link.Attr("href")
		// The error is discarded on purpose, as before; presumably most
		// failures here are links colly refuses to follow (already visited,
		// outside the allowed domains, too deep) — TODO confirm.
		_ = link.Request.Visit(target)
	})

	// Cache: keep the raw body of every page once it has been scraped.
	crawler.OnScraped(func(resp *colly.Response) {
		pageURL := resp.Request.URL.String()
		c.Responses.Insert(pageURL, string(resp.Body))
	})

	// Walk the archive one day at a time, including the current day.
	firstDay := time.Date(2000, time.January, 1, 0, 0, 0, 0, time.UTC)
	now := time.Now()
	for day := firstDay; !day.After(now); day = day.AddDate(0, 0, 1) {
		archiveURL := fmt.Sprintf(archiveURLFormat, day.Format(archiveDateLayout))
		// Best effort: a failing archive page must not stop the walk.
		_ = crawler.Visit(archiveURL)
	}

	// NOTE(review): the handlers below are disabled. They appear to build
	// model.Article entries from the scraped pages; they reference a `results`
	// map that is not defined in this function — confirm before re-enabling.

	//// create entry if not behind paywall
	//paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
	//collycollector.OnResponse(func(r *colly.Response) {
	//	if paywall_false_pattern.Match(r.Body) {
	//		url := r.Request.URL.String()
	//		(*results)[url] = &model.Article{
	//			SourceUrl: url,
	//			FetchDate: time.Now(),
	//			Content:   "",
	//		}
	//	}
	//})

	//// check for article type
	//collycollector.OnHTML("meta[property='og:type']", func(e *colly.HTMLElement) {
	//	if e.Attr("content") != "article" {
	//		delete(*results, e.Request.URL.String())
	//	}
	//})

	//// add title
	//collycollector.OnHTML("meta[property='og:title']", func(e *colly.HTMLElement) {
	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
	//		val.Title = e.Attr("content")
	//	}
	//})

	//// prepend description to content of article
	//collycollector.OnHTML("meta[name='description']", func(e *colly.HTMLElement) {
	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
	//		val.Content = e.Attr("content") + val.Content
	//	}
	//})

	//// add publishing date
	//collycollector.OnHTML("meta[name='date']", func(e *colly.HTMLElement) {
	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
	//		t, err := time.Parse("2006-01-02T15:04:05-07:00", e.Attr("content"))
	//		if err != nil {
	//			panic(err)
	//		}
	//		val.PublishDate = t
	//	}
	//})

	//// add author
	//collycollector.OnHTML("meta[name='author']", func(e *colly.HTMLElement) {
	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
	//		val.Author = e.Attr("content")
	//	}
	//})

	//// add content
	//collycollector.OnHTML("main[id='Inhalt'] div > p", func(e *colly.HTMLElement) {
	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
	//		cont := val.Content
	//		pattern := regexp.MustCompile("\\s+")
	//		cont = string(pattern.ReplaceAll([]byte(cont), []byte(" ")))
	//		cont = strings.ReplaceAll(cont, "»", "\"")
	//		cont = strings.ReplaceAll(cont, "«", "\"")
	//		val.Content = cont + " " + e.Text
	//	}
	//})
}