Change database from SQLite3 to PostgreSQL

2025-01-07 09:32:57 +01:00
parent f719c73b46
commit 9302e982c6
20 changed files with 309 additions and 284 deletions
--- a/cmd/crawler/extractors/spiegel.go
+++ b/cmd/crawler/extractors/spiegel.go
@@ -10,104 +10,125 @@ import (
 	"github.com/PuerkitoBio/goquery"
 )

-
 func (extractor *Extractor) Spiegel() error {
-    // get urls to process
-    urls, err := extractor.Responses.UnprocessedUrls()
-    if err != nil { return err }
+	// get urls to process
+	urls, err := extractor.Responses.UnprocessedUrls()
+	if err != nil {
+		return err
+	}

 	paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
-    url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*")
-    whitespace := regexp.MustCompile("\\s+")
+	url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*")
+	whitespace := regexp.MustCompile("\\s+")

-    var exists bool
-    var pagetype, title, content, datestr, author string
-    var tag *goquery.Selection
-    var date time.Time
+	var exists bool
+	var pagetype, title, content, datestr, author string
+	var tag *goquery.Selection
+	var date time.Time

-    for _, url := range urls {
-        // check url url pattern
-	    if !url_pattern.Match([]byte(url)) { continue }
+	for _, url := range urls {
+		// check url against the spiegel.de url pattern
+		if !url_pattern.Match([]byte(url)) {
+			continue
+		}

-        // get response
-        res, err := extractor.Responses.GetByUrl(url)
-        if err != nil {
-            log.Println("failed to process url", url, "with", err)
-            continue
-        }
-        
-        // check for paywall
-	    if !paywall_false_pattern.Match([]byte(res.Content)) {
-            extractor.Responses.Processed(url)
-            continue
-	    }
-        
-        // construct goquery doc
-        doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Content))
-        if err != nil {
-            log.Println("failed to process url", url, "with", err)
-            continue
-        }
-        
-        // check for article type
-        tag = doc.Find("meta[property='og:type']")
-        pagetype, exists = tag.Attr("content")
-        if !exists || pagetype != "article" { extractor.Responses.Processed(url); continue; }
+		// get response
+		res, err := extractor.Responses.GetByUrl(url)
+		if err != nil {
+			log.Println("failed to process url", url, "with", err)
+			continue
+		}

-	    // get title
-        tag = doc.Find("meta[property='og:title']")
-        title, exists = tag.Attr("content")
-        if !exists { extractor.Responses.Processed(url); continue; }
+		// check for paywall
+		if !paywall_false_pattern.Match([]byte(res.Content)) {
+			extractor.Responses.Processed(url)
+			continue
+		}

-	    // prepend description to content of article
-        tag = doc.Find("meta[name='description']")
-        content, exists = tag.Attr("content")
-        content += " "
-        if !exists { extractor.Responses.Processed(url); continue; }
+		// construct goquery doc
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(res.Content)))
+		if err != nil {
+			log.Println("failed to process url", url, "with", err)
+			continue
+		}

-        // get publishing date
-        tag = doc.Find("meta[name='date']")
-        datestr, exists = tag.Attr("content")
-        if !exists { extractor.Responses.Processed(url); continue; }
+		// check for article type
+		tag = doc.Find("meta[property='og:type']")
+		pagetype, exists = tag.Attr("content")
+		if !exists || pagetype != "article" {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// get title
+		tag = doc.Find("meta[property='og:title']")
+		title, exists = tag.Attr("content")
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// prepend description to content of article
+		tag = doc.Find("meta[name='description']")
+		content, exists = tag.Attr("content")
+		content += " "
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// get publishing date
+		tag = doc.Find("meta[name='date']")
+		datestr, exists = tag.Attr("content")
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}

 		date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
-        if err != nil { extractor.Responses.Processed(url); continue; }
+		if err != nil {
+			extractor.Responses.Processed(url)
+			continue
+		}

-	    // get author
-        tag = doc.Find("meta[name='author']")
-        author, exists = tag.Attr("content")
-        if !exists { extractor.Responses.Processed(url); continue; }
+		// get author
+		tag = doc.Find("meta[name='author']")
+		author, exists = tag.Attr("content")
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}

-        // get content
-        tag = doc.Find("main[id='Inhalt'] div > p")
+		// get content
+		tag = doc.Find("main[id='Inhalt'] div > p")

-        tag.Each(func(index int, p *goquery.Selection) {
-            content += " " + p.Text()
-        })
-        
-        // clean up content string
-        content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
-        content = strings.ReplaceAll(content, "»", "\"")
-        content = strings.ReplaceAll(content, "«", "\"")
-        
-        // insert new article
-        article := model.Article{
-            SourceUrl: url,
-            PublishDate: date,
-            FetchDate: res.FetchDate,
-            Title: title,
-            Content: content,
-            Author: author,
-        }
-        
-        err = extractor.Articles.Insert(&article)
-        if err != nil {
-            log.Println("failed to insert", article)
-        } else {
-            extractor.Responses.Processed(url)
-            log.Println("found article at", url)
-        }
-    }
+		tag.Each(func(index int, p *goquery.Selection) {
+			content += " " + p.Text()
+		})

-    return nil
+		// clean up content string
+		content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
+		content = strings.ReplaceAll(content, "»", "\"")
+		content = strings.ReplaceAll(content, "«", "\"")
+
+		// insert new article
+		article := model.Article{
+			SourceUrl:   url,
+			PublishDate: date,
+			FetchDate:   res.FetchDate,
+			Title:       title,
+			Content:     content,
+			Author:      author,
+		}
+
+		err = extractor.Articles.Insert(&article)
+		if err != nil {
+			log.Println("failed to insert", article)
+		} else {
+			extractor.Responses.Processed(url)
+			log.Println("found article at", url)
+		}
+	}
+
+	return nil
 }