crowsnest/cmd/crawler/extractors/spiegel.go

package extractors

import (
	"crowsnest/internal/model"
	"log"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)


// Spiegel extracts articles from stored responses that have not been
// processed yet and whose URL belongs to spiegel.de, and inserts them into
// the article store.
//
// For each matching URL the stored response is loaded and inspected:
//   - responses lacking the "paywall inactive" marker are marked processed
//     and skipped;
//   - pages whose og:type is not "article", or that lack a title,
//     description, date or author meta tag (or carry an unparsable date),
//     are marked processed and skipped;
//   - otherwise the description and all paragraphs below main#Inhalt are
//     joined, whitespace-normalized and stored as a model.Article.
//
// A URL is marked processed only after a successful insert or when the page
// is deliberately skipped. Load, parse and insert failures are logged and
// the URL is left unprocessed (presumably so a later run retries it — confirm
// against the Responses store).
//
// The returned error is non-nil only when the list of unprocessed URLs
// cannot be obtained.
func (extractor *Extractor) Spiegel() error {
	// Literal marker searched for in the raw page; only pages containing it
	// are treated as freely readable.
	const paywallInactive = `"paywall":{"attributes":{"is_active":false`

	// The host must be followed by a path, query, fragment or port delimiter
	// (or the end of the string) so that look-alike hosts such as
	// "spiegel.de.example.com" or "spiegelxde" are rejected.
	urlPattern := regexp.MustCompile(`^https://(www\.)?spiegel\.de([/?#:]|$)`)
	whitespace := regexp.MustCompile(`\s+`)

	// metaContent returns the content attribute of the first element matched
	// by selector and whether that attribute exists.
	metaContent := func(doc *goquery.Document, selector string) (string, bool) {
		return doc.Find(selector).Attr("content")
	}

	// get urls to process
	urls, err := extractor.Responses.UnprocessedUrls()
	if err != nil {
		return err
	}

	for _, url := range urls {
		// only handle urls of this site
		if !urlPattern.MatchString(url) {
			continue
		}

		// get response
		res, err := extractor.Responses.GetByUrl(url)
		if err != nil {
			log.Println("failed to process url", url, "with", err)
			continue
		}

		// check for paywall
		if !strings.Contains(res.Content, paywallInactive) {
			extractor.Responses.Processed(url)
			continue
		}

		// construct goquery doc
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Content))
		if err != nil {
			log.Println("failed to process url", url, "with", err)
			continue
		}

		// check for article type
		pagetype, ok := metaContent(doc, "meta[property='og:type']")
		if !ok || pagetype != "article" {
			extractor.Responses.Processed(url)
			continue
		}

		// get title
		title, ok := metaContent(doc, "meta[property='og:title']")
		if !ok {
			extractor.Responses.Processed(url)
			continue
		}

		// get description; it is prepended to the article content below
		description, ok := metaContent(doc, "meta[name='description']")
		if !ok {
			extractor.Responses.Processed(url)
			continue
		}

		// get publishing date
		datestr, ok := metaContent(doc, "meta[name='date']")
		if !ok {
			extractor.Responses.Processed(url)
			continue
		}

		// RFC 3339 accepts numeric offsets as well as the "Z" suffix for UTC.
		date, err := time.Parse(time.RFC3339, datestr)
		if err != nil {
			extractor.Responses.Processed(url)
			continue
		}

		// get author
		author, ok := metaContent(doc, "meta[name='author']")
		if !ok {
			extractor.Responses.Processed(url)
			continue
		}

		// get content: description followed by every paragraph of the article
		var body strings.Builder
		body.WriteString(description)
		doc.Find("main[id='Inhalt'] div > p").Each(func(_ int, p *goquery.Selection) {
			body.WriteByte(' ')
			body.WriteString(p.Text())
		})

		// clean up content string: collapse whitespace runs, replace
		// guillemets with plain double quotes, drop surrounding blanks
		content := whitespace.ReplaceAllString(body.String(), " ")
		content = strings.ReplaceAll(content, "»", "\"")
		content = strings.ReplaceAll(content, "«", "\"")
		content = strings.TrimSpace(content)

		// insert new article
		article := model.Article{
			SourceUrl:   url,
			PublishDate: date,
			FetchDate:   res.FetchDate,
			Title:       title,
			Content:     content,
			Author:      author,
		}

		if err := extractor.Articles.Insert(&article); err != nil {
			// Log the cause instead of dumping the whole article body.
			log.Println("failed to insert article from", url, "with", err)
			continue
		}

		extractor.Responses.Processed(url)
		log.Println("found article at", url)
	}

	return nil
}