114 lines
3.4 KiB
Go
114 lines
3.4 KiB
Go
|
|
package extractors
|
||
|
|
|
||
|
|
import (
|
||
|
|
"crowsnest/internal/model"
|
||
|
|
"log"
|
||
|
|
"regexp"
|
||
|
|
"strings"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
"github.com/PuerkitoBio/goquery"
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
func (extractor *Extractor) Spiegel() error {
|
||
|
|
// get urls to process
|
||
|
|
urls, err := extractor.Responses.UnprocessedUrls()
|
||
|
|
if err != nil { return err }
|
||
|
|
|
||
|
|
paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
|
||
|
|
url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*")
|
||
|
|
whitespace := regexp.MustCompile("\\s+")
|
||
|
|
|
||
|
|
var exists bool
|
||
|
|
var pagetype, title, content, datestr, author string
|
||
|
|
var tag *goquery.Selection
|
||
|
|
var date time.Time
|
||
|
|
|
||
|
|
for _, url := range urls {
|
||
|
|
// check url url pattern
|
||
|
|
if !url_pattern.Match([]byte(url)) { continue }
|
||
|
|
|
||
|
|
// get response
|
||
|
|
res, err := extractor.Responses.GetByUrl(url)
|
||
|
|
if err != nil {
|
||
|
|
log.Println("failed to process url", url, "with", err)
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
// check for paywall
|
||
|
|
if !paywall_false_pattern.Match([]byte(res.Content)) {
|
||
|
|
extractor.Responses.Processed(url)
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
// construct goquery doc
|
||
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Content))
|
||
|
|
if err != nil {
|
||
|
|
log.Println("failed to process url", url, "with", err)
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
// check for article type
|
||
|
|
tag = doc.Find("meta[property='og:type']")
|
||
|
|
pagetype, exists = tag.Attr("content")
|
||
|
|
if !exists || pagetype != "article" { extractor.Responses.Processed(url); continue; }
|
||
|
|
|
||
|
|
// get title
|
||
|
|
tag = doc.Find("meta[property='og:title']")
|
||
|
|
title, exists = tag.Attr("content")
|
||
|
|
if !exists { extractor.Responses.Processed(url); continue; }
|
||
|
|
|
||
|
|
// prepend description to content of article
|
||
|
|
tag = doc.Find("meta[name='description']")
|
||
|
|
content, exists = tag.Attr("content")
|
||
|
|
content += " "
|
||
|
|
if !exists { extractor.Responses.Processed(url); continue; }
|
||
|
|
|
||
|
|
// get publishing date
|
||
|
|
tag = doc.Find("meta[name='date']")
|
||
|
|
datestr, exists = tag.Attr("content")
|
||
|
|
if !exists { extractor.Responses.Processed(url); continue; }
|
||
|
|
|
||
|
|
date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
|
||
|
|
if err != nil { extractor.Responses.Processed(url); continue; }
|
||
|
|
|
||
|
|
// get author
|
||
|
|
tag = doc.Find("meta[name='author']")
|
||
|
|
author, exists = tag.Attr("content")
|
||
|
|
if !exists { extractor.Responses.Processed(url); continue; }
|
||
|
|
|
||
|
|
// get content
|
||
|
|
tag = doc.Find("main[id='Inhalt'] div > p")
|
||
|
|
|
||
|
|
tag.Each(func(index int, p *goquery.Selection) {
|
||
|
|
content += " " + p.Text()
|
||
|
|
})
|
||
|
|
|
||
|
|
// clean up content string
|
||
|
|
content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
|
||
|
|
content = strings.ReplaceAll(content, "»", "\"")
|
||
|
|
content = strings.ReplaceAll(content, "«", "\"")
|
||
|
|
|
||
|
|
// insert new article
|
||
|
|
article := model.Article{
|
||
|
|
SourceUrl: url,
|
||
|
|
PublishDate: date,
|
||
|
|
FetchDate: res.FetchDate,
|
||
|
|
Title: title,
|
||
|
|
Content: content,
|
||
|
|
Author: author,
|
||
|
|
}
|
||
|
|
|
||
|
|
err = extractor.Articles.Insert(&article)
|
||
|
|
if err != nil {
|
||
|
|
log.Println("failed to insert", article)
|
||
|
|
} else {
|
||
|
|
extractor.Responses.Processed(url)
|
||
|
|
log.Println("found article at", url)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return nil
|
||
|
|
}
|