remove caching in database for performance reasons

assets/migrations/20250107091120_remove_responses.sql (new file, 14 lines added)
@@ -0,0 +1,14 @@
+-- +goose Up
+-- +goose StatementBegin
+DROP TABLE IF EXISTS responses;
+-- +goose StatementEnd
+
+-- +goose Down
+-- +goose StatementBegin
+CREATE TABLE responses (
+    url VARCHAR(255) NOT NULL UNIQUE PRIMARY KEY,
+    content BYTEA NOT NULL,
+    fetchDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    processed BOOLEAN DEFAULT FALSE
+);
+-- +goose StatementEnd

@@ -4,4 +4,5 @@ import "crowsnest/internal/model/database"
 
 type Collector struct {
 	Responses *database.ResponseModel
+	Articles  *database.ArticleModel
 }

@@ -2,47 +2,42 @@ package collectors
 
 import (
 	"crowsnest/internal/model"
+	"errors"
 	"fmt"
 	"log"
+	"regexp"
 	"strings"
 	"time"
 
+	"github.com/PuerkitoBio/goquery"
 	"github.com/gocolly/colly/v2"
 )
 
-func (c *Collector) Spiegel() {
+func (c *Collector) SpiegelCollect() {
 	collycollector := colly.NewCollector(
 		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
 		colly.CacheDir("./persistence/spiegel_cache"),
-		colly.MaxDepth(2),
+		colly.MaxDepth(5),
 	)
 
 	// cache
-	collycollector.OnRequest(func(r *colly.Request) {
-		url := r.URL.String()
-		exists, err := c.Responses.UrlExists(url)
-		if err == nil && !exists {
-			c.Responses.Insert(url, nil)
-			log.Println("request", url)
-		} else {
-			r.Abort()
-		}
-	})
 	collycollector.OnResponse(func(r *colly.Response) {
 		url := r.Request.URL.String()
-		c.Responses.Update(&model.Response{Url: url, Content: r.Body, FetchDate: time.Now(), Processed: false})
-		log.Println("response cached", url)
+		err := c.SpiegelExtract(url, r.Body)
+		if err == nil {
+			log.Println("added article", url)
+		} else {
+			log.Println("failed to add article:", err, "("+url+")")
+		}
 	})
 
 	// cascade
 	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
 		url := e.Attr("href")
-		log.Println("found", url)
 
 		if !strings.HasPrefix(url, "http") {
 			return
 		}
-		log.Println("visiting", url)
 		e.Request.Visit(url)
 	})
 
@@ -57,3 +52,96 @@ func (c *Collector) Spiegel() {
 		collycollector.Visit(url)
 	}
 }
+
+func (c *Collector) SpiegelExtract(url string, body []byte) error {
+	paywall_pattern := regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
+	url_pattern := regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
+	whitespace := regexp.MustCompile(`\s+`)
+
+	var exists bool
+	var pagetype, title, content, datestr, author string
+	var tag *goquery.Selection
+	var date time.Time
+
+	// check url pattern
+	if !url_pattern.Match([]byte(url)) {
+		return errors.New("invalid url pattern")
+	}
+
+	// check for paywall
+	if paywall_pattern.Match(body) {
+		return errors.New("unable to extract article due to paywall")
+	}
+
+	// construct goquery doc
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
+	if err != nil {
+		return err
+	}
+
+	// check for article type
+	tag = doc.Find("meta[property='og:type']")
+	pagetype, exists = tag.Attr("content")
+	if !exists || pagetype != "article" {
+		return errors.New("unable to extract article, not of type article")
+	}
+
+	// get title
+	tag = doc.Find("meta[property='og:title']")
+	title, exists = tag.Attr("content")
+	if !exists {
+		return errors.New("unable to extract article, no title tag")
+	}
+
+	// prepend description to content of article
+	tag = doc.Find("meta[name='description']")
+	content, exists = tag.Attr("content")
+	content += " "
+	if !exists {
+		return errors.New("unable to extract article, no description tag")
+	}
+
+	// get publishing date
+	tag = doc.Find("meta[name='date']")
+	datestr, exists = tag.Attr("content")
+	if !exists {
+		return errors.New("unable to extract article, no date tag")
+	}
+
+	date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
+	if err != nil {
+		return err
+	}
+
+	// get author
+	tag = doc.Find("meta[name='author']")
+	author, exists = tag.Attr("content")
+	if !exists {
+		return errors.New("unable to extract article, no author tag")
+	}
+
+	// get content
+	tag = doc.Find("main[id='Inhalt'] div > p")
+
+	tag.Each(func(index int, p *goquery.Selection) {
+		content += " " + p.Text()
+	})
+
+	// clean up content string
+	content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
+	content = strings.ReplaceAll(content, "»", "\"")
+	content = strings.ReplaceAll(content, "«", "\"")
+
+	// insert new article
+	article := model.Article{
+		SourceUrl:   url,
+		PublishDate: date,
+		FetchDate:   time.Now(),
+		Title:       title,
+		Content:     content,
+		Author:      author,
+	}
+
+	err = c.Articles.Insert(&article)
+	return err
+}

@@ -1,8 +0,0 @@
-package extractors
-
-import "crowsnest/internal/model/database"
-
-type Extractor struct {
-	Responses *database.ResponseModel
-	Articles  *database.ArticleModel
-}

@@ -1,134 +0,0 @@
-package extractors
-
-import (
-	"crowsnest/internal/model"
-	"log"
-	"regexp"
-	"strings"
-	"time"
-
-	"github.com/PuerkitoBio/goquery"
-)
-
-func (extractor *Extractor) Spiegel() error {
-	// get urls to process
-	urls, err := extractor.Responses.UnprocessedUrls()
-	if err != nil {
-		return err
-	}
-
-	paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
-	url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*")
-	whitespace := regexp.MustCompile("\\s+")
-
-	var exists bool
-	var pagetype, title, content, datestr, author string
-	var tag *goquery.Selection
-	var date time.Time
-
-	for _, url := range urls {
-		// check url url pattern
-		if !url_pattern.Match([]byte(url)) {
-			continue
-		}
-
-		// get response
-		res, err := extractor.Responses.GetByUrl(url)
-		if err != nil {
-			log.Println("failed to process url", url, "with", err)
-			continue
-		}
-
-		// check for paywall
-		if !paywall_false_pattern.Match([]byte(res.Content)) {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// construct goquery doc
-		doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(res.Content)))
-		if err != nil {
-			log.Println("failed to process url", url, "with", err)
-			continue
-		}
-
-		// check for article type
-		tag = doc.Find("meta[property='og:type']")
-		pagetype, exists = tag.Attr("content")
-		if !exists || pagetype != "article" {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// get title
-		tag = doc.Find("meta[property='og:title']")
-		title, exists = tag.Attr("content")
-		if !exists {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// prepend description to content of article
-		tag = doc.Find("meta[name='description']")
-		content, exists = tag.Attr("content")
-		content += " "
-		if !exists {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// get publishing date
-		tag = doc.Find("meta[name='date']")
-		datestr, exists = tag.Attr("content")
-		if !exists {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
-		if err != nil {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// get author
-		tag = doc.Find("meta[name='author']")
-		author, exists = tag.Attr("content")
-		if !exists {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// get content
-		tag = doc.Find("main[id='Inhalt'] div > p")
-
-		tag.Each(func(index int, p *goquery.Selection) {
-			content += " " + p.Text()
-		})
-
-		// clean up content string
-		content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
-		content = strings.ReplaceAll(content, "»", "\"")
-		content = strings.ReplaceAll(content, "«", "\"")
-
-		// insert new article
-		article := model.Article{
-			SourceUrl:   url,
-			PublishDate: date,
-			FetchDate:   res.FetchDate,
-			Title:       title,
-			Content:     content,
-			Author:      author,
-		}
-
-		err = extractor.Articles.Insert(&article)
-		if err != nil {
-			log.Println("failed to insert", article)
-		} else {
-			extractor.Responses.Processed(url)
-			log.Println("found article at", url)
-		}
-	}
-
-	return nil
-}

@@ -1,12 +1,11 @@
 package main
 
 import (
-	"crowsnest/cmd/crawler/extractors"
+	"crowsnest/cmd/crawler/collectors"
 	"crowsnest/internal/model/database"
 	"database/sql"
 	"log"
 	"os"
-	"time"
 
 	_ "github.com/lib/pq"
 )
@@ -23,21 +22,11 @@ func main() {
 	defer db.Close()
 
 	// collect websites
-	//coll := collectors.Collector{
-	// Responses: &database.ResponseModel{DB: db},
-	//}
-
-	//coll.Spiegel()
-	//coll.Zeit()
-
-	// extract articles from websites
-	extr := extractors.Extractor{
+	coll := collectors.Collector{
 		Responses: &database.ResponseModel{DB: db},
 		Articles:  &database.ArticleModel{DB: db},
 	}
 
-	for {
-		extr.Spiegel()
-		time.Sleep(5 * time.Second)
-	}
+	coll.SpiegelCollect()
+	//coll.Zeit()
 }