cmd/crawler/extractors/spiegel.go

package extractors

import (
	"crowsnest/internal/model"
	"log"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)


// Spiegel extracts articles from the stored responses whose URL belongs to
// spiegel.de and inserts them through extractor.Articles.
//
// For every URL returned by Responses.UnprocessedUrls that matches the
// spiegel.de host pattern, the stored response is loaded and then:
//
//   - pages without the "paywall inactive" marker in their raw content,
//   - pages that are not of og:type "article", and
//   - pages missing one of the required meta tags or carrying an
//     unparsable publishing date
//
// are passed to Responses.Processed without producing an article. Pages
// that yield an article are passed to Responses.Processed only after the
// insert succeeded; a response that cannot be loaded or parsed as HTML, or
// an article that cannot be inserted, is logged and left untouched
// (presumably so that a later run picks it up again).
//
// The only error returned is the one from Responses.UnprocessedUrls;
// per-URL failures are logged and skipped.
func (extractor *Extractor) Spiegel() error {
	// get urls to process
	urls, err := extractor.Responses.UnprocessedUrls()
	if err != nil {
		return err
	}

	// Marker embedded in the page source of freely readable articles.
	paywallInactive := regexp.MustCompile(`"paywall":{"attributes":{"is_active":false`)
	// The dots are escaped and the host must end right after ".de", so that
	// look-alike hosts such as "spiegel.de.example.com" are not accepted.
	urlPattern := regexp.MustCompile(`^https://(www\.)?spiegel\.de([/?#:]|$)`)
	whitespace := regexp.MustCompile(`\s+`)

	for _, url := range urls {
		// only handle urls of this source
		if !urlPattern.MatchString(url) {
			continue
		}

		// get response
		res, err := extractor.Responses.GetByUrl(url)
		if err != nil {
			log.Println("failed to process url", url, "with", err)
			continue
		}

		// skip paywalled pages for good
		if !paywallInactive.MatchString(res.Content) {
			extractor.Responses.Processed(url)
			continue
		}

		// construct goquery doc
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Content))
		if err != nil {
			log.Println("failed to process url", url, "with", err)
			continue
		}

		// pull the article fields out of the document
		article, ok := spiegelArticleFromDoc(doc, whitespace)
		if !ok {
			extractor.Responses.Processed(url)
			continue
		}
		article.SourceUrl = url
		article.FetchDate = res.FetchDate

		// insert new article; keep the url unprocessed when the insert fails
		if err := extractor.Articles.Insert(&article); err != nil {
			log.Println("failed to insert article from", url, "with", err)
			continue
		}
		extractor.Responses.Processed(url)
		log.Println("found article at", url)
	}

	return nil
}

// spiegelArticleFromDoc builds an article from a parsed Spiegel page.
//
// It fills Title, Content, Author and PublishDate; SourceUrl and FetchDate
// are left for the caller. Content is the meta description followed by the
// text of all paragraphs under main#Inhalt, with every whitespace run
// (matched by the whitespace pattern) collapsed into a single space and the
// guillemets » and « replaced by a plain double quote.
//
// The boolean is false when the page is not of og:type "article", when one
// of the title, description, date or author meta tags is missing, or when
// the date is not a valid RFC 3339 timestamp.
func spiegelArticleFromDoc(doc *goquery.Document, whitespace *regexp.Regexp) (model.Article, bool) {
	// meta returns the content attribute of the first element matching selector.
	meta := func(selector string) (string, bool) {
		return doc.Find(selector).Attr("content")
	}

	// check for article type
	if pagetype, ok := meta("meta[property='og:type']"); !ok || pagetype != "article" {
		return model.Article{}, false
	}

	// get title
	title, ok := meta("meta[property='og:title']")
	if !ok {
		return model.Article{}, false
	}

	// get description, which is prepended to the article content
	description, ok := meta("meta[name='description']")
	if !ok {
		return model.Article{}, false
	}

	// get publishing date
	datestr, ok := meta("meta[name='date']")
	if !ok {
		return model.Article{}, false
	}
	// RFC 3339 accepts numeric offsets such as "+01:00" as well as "Z".
	date, err := time.Parse(time.RFC3339, datestr)
	if err != nil {
		return model.Article{}, false
	}

	// get author
	author, ok := meta("meta[name='author']")
	if !ok {
		return model.Article{}, false
	}

	// get content: description first, then every paragraph of the article
	var body strings.Builder
	body.WriteString(description)
	body.WriteString(" ")
	doc.Find("main[id='Inhalt'] div > p").Each(func(_ int, p *goquery.Selection) {
		body.WriteString(" ")
		body.WriteString(p.Text())
	})

	// clean up content string
	content := whitespace.ReplaceAllString(body.String(), " ")
	content = strings.NewReplacer("»", "\"", "«", "\"").Replace(content)

	return model.Article{
		PublishDate: date,
		Title:       title,
		Content:     content,
		Author:      author,
	}, true
}
move collection and extraction of articles into their own file; add custom response cache 2025-01-04 16:53:46 +01:00			`package extractors`

			`import (`
			`"crowsnest/internal/model"`
			`"log"`
			`"regexp"`
			`"strings"`
			`"time"`

			`"github.com/PuerkitoBio/goquery"`
			`)`


			`func (extractor *Extractor) Spiegel() error {`
			`// get urls to process`
			`urls, err := extractor.Responses.UnprocessedUrls()`
			`if err != nil { return err }`

			`paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")`
			`url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*")`
			`whitespace := regexp.MustCompile("\\s+")`

			`var exists bool`
			`var pagetype, title, content, datestr, author string`
			`var tag *goquery.Selection`
			`var date time.Time`

			`for _, url := range urls {`
			`// check url url pattern`
			`if !url_pattern.Match([]byte(url)) { continue }`

			`// get response`
			`res, err := extractor.Responses.GetByUrl(url)`
			`if err != nil {`
			`log.Println("failed to process url", url, "with", err)`
			`continue`
			`}`

			`// check for paywall`
			`if !paywall_false_pattern.Match([]byte(res.Content)) {`
			`extractor.Responses.Processed(url)`
			`continue`
			`}`

			`// construct goquery doc`
			`doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Content))`
			`if err != nil {`
			`log.Println("failed to process url", url, "with", err)`
			`continue`
			`}`

			`// check for article type`
			`tag = doc.Find("meta[property='og:type']")`
			`pagetype, exists = tag.Attr("content")`
			`if !exists \|\| pagetype != "article" { extractor.Responses.Processed(url); continue; }`

			`// get title`
			`tag = doc.Find("meta[property='og:title']")`
			`title, exists = tag.Attr("content")`
			`if !exists { extractor.Responses.Processed(url); continue; }`

			`// prepend description to content of article`
			`tag = doc.Find("meta[name='description']")`
			`content, exists = tag.Attr("content")`
			`content += " "`
			`if !exists { extractor.Responses.Processed(url); continue; }`

			`// get publishing date`
			`tag = doc.Find("meta[name='date']")`
			`datestr, exists = tag.Attr("content")`
			`if !exists { extractor.Responses.Processed(url); continue; }`

			`date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)`
			`if err != nil { extractor.Responses.Processed(url); continue; }`

			`// get author`
			`tag = doc.Find("meta[name='author']")`
			`author, exists = tag.Attr("content")`
			`if !exists { extractor.Responses.Processed(url); continue; }`

			`// get content`
			`tag = doc.Find("main[id='Inhalt'] div > p")`

			`tag.Each(func(index int, p *goquery.Selection) {`
			`content += " " + p.Text()`
			`})`

			`// clean up content string`
			`content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))`
			`content = strings.ReplaceAll(content, "»", "\"")`
			`content = strings.ReplaceAll(content, "«", "\"")`

			`// insert new article`
			`article := model.Article{`
			`SourceUrl: url,`
			`PublishDate: date,`
			`FetchDate: res.FetchDate,`
			`Title: title,`
			`Content: content,`
			`Author: author,`
			`}`

			`err = extractor.Articles.Insert(&article)`
			`if err != nil {`
			`log.Println("failed to insert", article)`
			`} else {`
			`extractor.Responses.Processed(url)`
			`log.Println("found article at", url)`
			`}`
			`}`

			`return nil`
			`}`