change db from sqlite3 to postgresql
This commit is contained in:
@@ -10,104 +10,125 @@ import (
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
|
||||
func (extractor *Extractor) Spiegel() error {
|
||||
// get urls to process
|
||||
urls, err := extractor.Responses.UnprocessedUrls()
|
||||
if err != nil { return err }
|
||||
// get urls to process
|
||||
urls, err := extractor.Responses.UnprocessedUrls()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
|
||||
url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*")
|
||||
whitespace := regexp.MustCompile("\\s+")
|
||||
url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*")
|
||||
whitespace := regexp.MustCompile("\\s+")
|
||||
|
||||
var exists bool
|
||||
var pagetype, title, content, datestr, author string
|
||||
var tag *goquery.Selection
|
||||
var date time.Time
|
||||
var exists bool
|
||||
var pagetype, title, content, datestr, author string
|
||||
var tag *goquery.Selection
|
||||
var date time.Time
|
||||
|
||||
for _, url := range urls {
|
||||
// check url url pattern
|
||||
if !url_pattern.Match([]byte(url)) { continue }
|
||||
for _, url := range urls {
|
||||
// check url url pattern
|
||||
if !url_pattern.Match([]byte(url)) {
|
||||
continue
|
||||
}
|
||||
|
||||
// get response
|
||||
res, err := extractor.Responses.GetByUrl(url)
|
||||
if err != nil {
|
||||
log.Println("failed to process url", url, "with", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// check for paywall
|
||||
if !paywall_false_pattern.Match([]byte(res.Content)) {
|
||||
extractor.Responses.Processed(url)
|
||||
continue
|
||||
}
|
||||
|
||||
// construct goquery doc
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Content))
|
||||
if err != nil {
|
||||
log.Println("failed to process url", url, "with", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// check for article type
|
||||
tag = doc.Find("meta[property='og:type']")
|
||||
pagetype, exists = tag.Attr("content")
|
||||
if !exists || pagetype != "article" { extractor.Responses.Processed(url); continue; }
|
||||
// get response
|
||||
res, err := extractor.Responses.GetByUrl(url)
|
||||
if err != nil {
|
||||
log.Println("failed to process url", url, "with", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// get title
|
||||
tag = doc.Find("meta[property='og:title']")
|
||||
title, exists = tag.Attr("content")
|
||||
if !exists { extractor.Responses.Processed(url); continue; }
|
||||
// check for paywall
|
||||
if !paywall_false_pattern.Match([]byte(res.Content)) {
|
||||
extractor.Responses.Processed(url)
|
||||
continue
|
||||
}
|
||||
|
||||
// prepend description to content of article
|
||||
tag = doc.Find("meta[name='description']")
|
||||
content, exists = tag.Attr("content")
|
||||
content += " "
|
||||
if !exists { extractor.Responses.Processed(url); continue; }
|
||||
// construct goquery doc
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(res.Content)))
|
||||
if err != nil {
|
||||
log.Println("failed to process url", url, "with", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// get publishing date
|
||||
tag = doc.Find("meta[name='date']")
|
||||
datestr, exists = tag.Attr("content")
|
||||
if !exists { extractor.Responses.Processed(url); continue; }
|
||||
// check for article type
|
||||
tag = doc.Find("meta[property='og:type']")
|
||||
pagetype, exists = tag.Attr("content")
|
||||
if !exists || pagetype != "article" {
|
||||
extractor.Responses.Processed(url)
|
||||
continue
|
||||
}
|
||||
|
||||
// get title
|
||||
tag = doc.Find("meta[property='og:title']")
|
||||
title, exists = tag.Attr("content")
|
||||
if !exists {
|
||||
extractor.Responses.Processed(url)
|
||||
continue
|
||||
}
|
||||
|
||||
// prepend description to content of article
|
||||
tag = doc.Find("meta[name='description']")
|
||||
content, exists = tag.Attr("content")
|
||||
content += " "
|
||||
if !exists {
|
||||
extractor.Responses.Processed(url)
|
||||
continue
|
||||
}
|
||||
|
||||
// get publishing date
|
||||
tag = doc.Find("meta[name='date']")
|
||||
datestr, exists = tag.Attr("content")
|
||||
if !exists {
|
||||
extractor.Responses.Processed(url)
|
||||
continue
|
||||
}
|
||||
|
||||
date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
|
||||
if err != nil { extractor.Responses.Processed(url); continue; }
|
||||
if err != nil {
|
||||
extractor.Responses.Processed(url)
|
||||
continue
|
||||
}
|
||||
|
||||
// get author
|
||||
tag = doc.Find("meta[name='author']")
|
||||
author, exists = tag.Attr("content")
|
||||
if !exists { extractor.Responses.Processed(url); continue; }
|
||||
// get author
|
||||
tag = doc.Find("meta[name='author']")
|
||||
author, exists = tag.Attr("content")
|
||||
if !exists {
|
||||
extractor.Responses.Processed(url)
|
||||
continue
|
||||
}
|
||||
|
||||
// get content
|
||||
tag = doc.Find("main[id='Inhalt'] div > p")
|
||||
// get content
|
||||
tag = doc.Find("main[id='Inhalt'] div > p")
|
||||
|
||||
tag.Each(func(index int, p *goquery.Selection) {
|
||||
content += " " + p.Text()
|
||||
})
|
||||
|
||||
// clean up content string
|
||||
content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
|
||||
content = strings.ReplaceAll(content, "»", "\"")
|
||||
content = strings.ReplaceAll(content, "«", "\"")
|
||||
|
||||
// insert new article
|
||||
article := model.Article{
|
||||
SourceUrl: url,
|
||||
PublishDate: date,
|
||||
FetchDate: res.FetchDate,
|
||||
Title: title,
|
||||
Content: content,
|
||||
Author: author,
|
||||
}
|
||||
|
||||
err = extractor.Articles.Insert(&article)
|
||||
if err != nil {
|
||||
log.Println("failed to insert", article)
|
||||
} else {
|
||||
extractor.Responses.Processed(url)
|
||||
log.Println("found article at", url)
|
||||
}
|
||||
}
|
||||
tag.Each(func(index int, p *goquery.Selection) {
|
||||
content += " " + p.Text()
|
||||
})
|
||||
|
||||
return nil
|
||||
// clean up content string
|
||||
content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
|
||||
content = strings.ReplaceAll(content, "»", "\"")
|
||||
content = strings.ReplaceAll(content, "«", "\"")
|
||||
|
||||
// insert new article
|
||||
article := model.Article{
|
||||
SourceUrl: url,
|
||||
PublishDate: date,
|
||||
FetchDate: res.FetchDate,
|
||||
Title: title,
|
||||
Content: content,
|
||||
Author: author,
|
||||
}
|
||||
|
||||
err = extractor.Articles.Insert(&article)
|
||||
if err != nil {
|
||||
log.Println("failed to insert", article)
|
||||
} else {
|
||||
extractor.Responses.Processed(url)
|
||||
log.Println("found article at", url)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user