diff --git a/src/internal/crawler/ZeitConverter.go b/src/internal/crawler/ZeitConverter.go deleted file mode 100644 index 3fe7845..0000000 --- a/src/internal/crawler/ZeitConverter.go +++ /dev/null @@ -1,100 +0,0 @@ -package crawler - -import ( - "crowsnest/internal/model" - "errors" - "regexp" - "strings" - "time" - - "github.com/PuerkitoBio/goquery" -) - -type ZeitConverter struct { - pattern_url *regexp.Regexp - pattern_whitespace *regexp.Regexp -} - -func (c *ZeitConverter) Init() { - c.pattern_url = regexp.MustCompile(`^https://(www\.)?zeit\.de[^#]*$`) - c.pattern_whitespace = regexp.MustCompile(`\s+`) -} - -func (c *ZeitConverter) Convert(res *Resource) (*model.Article, error) { - // check url url pattern - if !c.pattern_url.Match([]byte(res.Url)) { - return nil, errors.New("invalid url pattern") - } - - // construct goquery doc - doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Body)) - if err != nil { - return nil, err - } - - // check for article type - tag := doc.Find("meta[property='og:type']") - pagetype, exists := tag.Attr("content") - if !exists || pagetype != "article" { - return nil, errors.New("unable to extract article, not of type article") - } - - // check for paywall - tag = doc.Find("meta[property='article:content_tier']") - pagetype, exists = tag.Attr("content") - if !exists || pagetype != "free" { - return nil, errors.New("unable to extract article due to paywal") - } - - // get title - tag = doc.Find("meta[property='og:title']") - title, exists := tag.Attr("content") - if !exists { - return nil, errors.New("unable to extract article, no title tag") - } - - // prepend description to content of article - tag = doc.Find("meta[name='description']") - content, exists := tag.Attr("content") - content += " " - if !exists { - return nil, errors.New("unable to extract article, no description tag") - } - - if strings.Contains(content, "Das Liveblog") { - return nil, errors.New("unable to extract article, no support for liveblog") - } - - // get publishing date - tag = doc.Find("meta[name='date']") - datestr, exists := tag.Attr("content") - if !exists { - return nil, errors.New("unable to extract article, no date tag") - } - - date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr) - if err != nil { - return nil, err - } - - // get content - tag = doc.Find("main > article > div.article-body p.article__item") - - tag.Each(func(index int, p *goquery.Selection) { - content += " " + p.Text() - }) - - // clean up content string - content = string(c.pattern_whitespace.ReplaceAll([]byte(content), []byte(" "))) - content = strings.ReplaceAll(content, "»", "\"") - content = strings.ReplaceAll(content, "«", "\"") - - // create new article - return &model.Article{ - SourceUrl: res.Url, - PublishDate: date, - FetchDate: time.Now(), - Title: title, - Content: content, - }, nil -}