change db from sqlite3 to postgresql

This commit is contained in:
2025-01-07 09:32:57 +01:00
parent f719c73b46
commit 9302e982c6
20 changed files with 309 additions and 284 deletions

View File

@@ -10,104 +10,125 @@ import (
"github.com/PuerkitoBio/goquery"
)
// Spiegel walks all unprocessed response URLs, and for every spiegel.de
// page that is a non-paywalled article it extracts title, description,
// publishing date, author and paragraph text, then stores the result via
// extractor.Articles.Insert.
//
// Pages that cannot yield an article (paywalled, not of type "article", or
// missing one of the required meta tags) are handed to Responses.Processed
// and skipped. Pages whose response cannot be loaded or parsed, and articles
// whose insert fails, are logged and left untouched.
//
// It returns an error only when the list of unprocessed URLs cannot be
// obtained; per-URL failures never abort the run.
func (extractor *Extractor) Spiegel() error {
	// get urls to process
	urls, err := extractor.Responses.UnprocessedUrls()
	if err != nil {
		return err
	}

	// Marker in the page source stating that the paywall is not active.
	paywallFalsePattern := regexp.MustCompile(`"paywall":{"attributes":{"is_active":false`)
	// The dot is escaped and the host must end right after ".de" so that
	// look-alike hosts (spiegelxde.com, spiegel.de.example.org, spiegel.dev)
	// are not accepted.
	urlPattern := regexp.MustCompile(`^https://(www\.)?spiegel\.de([/?#:]|$)`)
	whitespace := regexp.MustCompile(`\s+`)

	for _, url := range urls {
		// check url pattern
		if !urlPattern.Match([]byte(url)) {
			continue
		}

		// get response
		res, err := extractor.Responses.GetByUrl(url)
		if err != nil {
			log.Println("failed to process url", url, "with", err)
			continue
		}

		// check for paywall
		if !paywallFalsePattern.Match([]byte(res.Content)) {
			extractor.Responses.Processed(url)
			continue
		}

		// construct goquery doc
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(res.Content)))
		if err != nil {
			log.Println("failed to process url", url, "with", err)
			continue
		}

		// check for article type
		pagetype, exists := doc.Find("meta[property='og:type']").Attr("content")
		if !exists || pagetype != "article" {
			extractor.Responses.Processed(url)
			continue
		}

		// get title
		title, exists := doc.Find("meta[property='og:title']").Attr("content")
		if !exists {
			extractor.Responses.Processed(url)
			continue
		}

		// get description; it is prepended to the article content below
		description, exists := doc.Find("meta[name='description']").Attr("content")
		if !exists {
			extractor.Responses.Processed(url)
			continue
		}

		// get publishing date
		datestr, exists := doc.Find("meta[name='date']").Attr("content")
		if !exists {
			extractor.Responses.Processed(url)
			continue
		}
		date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
		if err != nil {
			extractor.Responses.Processed(url)
			continue
		}

		// get author
		author, exists := doc.Find("meta[name='author']").Attr("content")
		if !exists {
			extractor.Responses.Processed(url)
			continue
		}

		// get content: description first, then every paragraph of the main
		// section, separated by spaces (runs of whitespace are collapsed
		// afterwards, so the exact number of separators does not matter)
		var body strings.Builder
		body.WriteString(description)
		body.WriteString(" ")
		doc.Find("main[id='Inhalt'] div > p").Each(func(_ int, p *goquery.Selection) {
			body.WriteString(" ")
			body.WriteString(p.Text())
		})

		// clean up content string
		content := whitespace.ReplaceAllString(body.String(), " ")
		content = strings.ReplaceAll(content, "»", "\"")
		content = strings.ReplaceAll(content, "«", "\"")

		// insert new article
		article := model.Article{
			SourceUrl:   url,
			PublishDate: date,
			FetchDate:   res.FetchDate,
			Title:       title,
			Content:     content,
			Author:      author,
		}
		if err := extractor.Articles.Insert(&article); err != nil {
			// Report the cause instead of dumping the whole article; the
			// URL is not passed to Responses.Processed on this path.
			log.Println("failed to insert article for", url, "with", err)
			continue
		}
		extractor.Responses.Processed(url)
		log.Println("found article at", url)
	}

	return nil
}