From d5f934783b2cc6234eea4f4cf0b433c0e8f0b4d6 Mon Sep 17 00:00:00 2001
From: Elias Kohout
Date: Tue, 7 Jan 2025 10:14:16 +0100
Subject: [PATCH] remove caching in database for performance reasons

---
 .../20250107091120_remove_responses.sql |  14 ++
 cmd/crawler/collectors/collector.go     |   1 +
 cmd/crawler/collectors/spiegel.go       | 120 +++++++++++++---
 cmd/crawler/extractors/extractor.go     |   8 --
 cmd/crawler/extractors/spiegel.go       | 134 ------------------
 cmd/crawler/main.go                     |  19 +--
 6 files changed, 123 insertions(+), 173 deletions(-)
 create mode 100644 assets/migrations/20250107091120_remove_responses.sql
 delete mode 100644 cmd/crawler/extractors/extractor.go
 delete mode 100644 cmd/crawler/extractors/spiegel.go

diff --git a/assets/migrations/20250107091120_remove_responses.sql b/assets/migrations/20250107091120_remove_responses.sql
new file mode 100644
index 0000000..7e08d15
--- /dev/null
+++ b/assets/migrations/20250107091120_remove_responses.sql
@@ -0,0 +1,14 @@
+-- +goose Up
+-- +goose StatementBegin
+DROP TABLE IF EXISTS responses;
+-- +goose StatementEnd
+
+-- +goose Down
+-- +goose StatementBegin
+CREATE TABLE responses (
+    url VARCHAR(255) NOT NULL UNIQUE PRIMARY KEY,
+    content BYTEA NOT NULL,
+    fetchDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    processed BOOLEAN DEFAULT FALSE
+);
+-- +goose StatementEnd
diff --git a/cmd/crawler/collectors/collector.go b/cmd/crawler/collectors/collector.go
index cb2461e..6635026 100644
--- a/cmd/crawler/collectors/collector.go
+++ b/cmd/crawler/collectors/collector.go
@@ -4,4 +4,5 @@ import "crowsnest/internal/model/database"
 
 type Collector struct {
 	Responses *database.ResponseModel
+	Articles  *database.ArticleModel
 }
diff --git a/cmd/crawler/collectors/spiegel.go b/cmd/crawler/collectors/spiegel.go
index 78ae73f..3f1dcbe 100644
--- a/cmd/crawler/collectors/spiegel.go
+++ b/cmd/crawler/collectors/spiegel.go
@@ -2,47 +2,42 @@ package collectors
 
 import (
 	"crowsnest/internal/model"
+	"errors"
 	"fmt"
 	"log"
+	"regexp"
 	"strings"
 	"time"
 
+	"github.com/PuerkitoBio/goquery"
 	"github.com/gocolly/colly/v2"
 )
 
-func (c *Collector) Spiegel() {
+func (c *Collector) SpiegelCollect() {
 	collycollector := colly.NewCollector(
 		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
 		colly.CacheDir("./persistence/spiegel_cache"),
-		colly.MaxDepth(2),
+		colly.MaxDepth(5),
 	)
 
 	// cache
-	collycollector.OnRequest(func(r *colly.Request) {
-		url := r.URL.String()
-		exists, err := c.Responses.UrlExists(url)
-		if err == nil && !exists {
-			c.Responses.Insert(url, nil)
-			log.Println("request", url)
-		} else {
-			r.Abort()
-		}
-	})
 
 	collycollector.OnResponse(func(r *colly.Response) {
 		url := r.Request.URL.String()
-		c.Responses.Update(&model.Response{Url: url, Content: r.Body, FetchDate: time.Now(), Processed: false})
-		log.Println("response cached", url)
+		err := c.SpiegelExtract(url, r.Body)
+		if err == nil {
+			log.Println("added article", url)
+		} else {
+			log.Println("failed to add article:", err, "("+url+")")
+		}
 	})
 
 	// cascade
 	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
 		url := e.Attr("href")
-		log.Println("found", url)
 		if !strings.HasPrefix(url, "http") {
 			return
 		}
-		log.Println("visiting", url)
 		e.Request.Visit(url)
 	})
 
@@ -57,3 +52,96 @@
 		collycollector.Visit(url)
 	}
 }
+
+func (c *Collector) SpiegelExtract(url string, body []byte) error {
+	paywall_pattern := regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
+	url_pattern := regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
+	whitespace := regexp.MustCompile(`\s+`)
+
+	var exists bool
+	var pagetype, title, content, datestr, author string
+	var tag *goquery.Selection
+	var date time.Time
+
+	// check url pattern
+	if !url_pattern.Match([]byte(url)) {
+		return errors.New("invalid url pattern")
+	}
+
+	// check for paywall
+	if paywall_pattern.Match(body) {
+		return errors.New("unable to extract article due to paywall")
+	}
+
+	// construct goquery doc
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
+	if err != nil {
+		return err
+	}
+
+	// check for article type
+	tag = doc.Find("meta[property='og:type']")
+	pagetype, exists = tag.Attr("content")
+	if !exists || pagetype != "article" {
+		return errors.New("unable to extract article, not of type article")
+	}
+
+	// get title
+	tag = doc.Find("meta[property='og:title']")
+	title, exists = tag.Attr("content")
+	if !exists {
+		return errors.New("unable to extract article, no title tag")
+	}
+
+	// prepend description to content of article
+	tag = doc.Find("meta[name='description']")
+	content, exists = tag.Attr("content")
+	content += " "
+	if !exists {
+		return errors.New("unable to extract article, no description tag")
+	}
+
+	// get publishing date
+	tag = doc.Find("meta[name='date']")
+	datestr, exists = tag.Attr("content")
+	if !exists {
+		return errors.New("unable to extract article, no date tag")
+	}
+
+	date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
+	if err != nil {
+		return err
+	}
+
+	// get author
+	tag = doc.Find("meta[name='author']")
+	author, exists = tag.Attr("content")
+	if !exists {
+		return errors.New("unable to extract article, no author tag")
+	}
+
+	// get content
+	tag = doc.Find("main[id='Inhalt'] div > p")
+
+	tag.Each(func(index int, p *goquery.Selection) {
+		content += " " + p.Text()
+	})
+
+	// clean up content string
+	content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
+	content = strings.ReplaceAll(content, "»", "\"")
+	content = strings.ReplaceAll(content, "«", "\"")
+
+	// insert new article
+	article := model.Article{
+		SourceUrl:   url,
+		PublishDate: date,
+		FetchDate:   time.Now(),
+		Title:       title,
+		Content:     content,
+		Author:      author,
+	}
+
+	err = c.Articles.Insert(&article)
+	return err
+}
diff --git a/cmd/crawler/extractors/extractor.go b/cmd/crawler/extractors/extractor.go
deleted file mode 100644
index 6a58950..0000000
--- a/cmd/crawler/extractors/extractor.go
+++ /dev/null
@@ -1,8 +0,0 @@
-package extractors
-
-import "crowsnest/internal/model/database"
-
-type Extractor struct {
-	Responses *database.ResponseModel
-	Articles  *database.ArticleModel
-}
diff --git a/cmd/crawler/extractors/spiegel.go b/cmd/crawler/extractors/spiegel.go
deleted file mode 100644
index 9dcb21f..0000000
--- a/cmd/crawler/extractors/spiegel.go
+++ /dev/null
@@ -1,134 +0,0 @@
-package extractors
-
-import (
-	"crowsnest/internal/model"
-	"log"
-	"regexp"
-	"strings"
-	"time"
-
-	"github.com/PuerkitoBio/goquery"
-)
-
-func (extractor *Extractor) Spiegel() error {
-	// get urls to process
-	urls, err := extractor.Responses.UnprocessedUrls()
-	if err != nil {
-		return err
-	}
-
-	paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
-	url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*")
-	whitespace := regexp.MustCompile("\\s+")
-
-	var exists bool
-	var pagetype, title, content, datestr, author string
-	var tag *goquery.Selection
-	var date time.Time
-
-	for _, url := range urls {
-		// check url url pattern
-		if !url_pattern.Match([]byte(url)) {
-			continue
-		}
-
-		// get response
-		res, err := extractor.Responses.GetByUrl(url)
-		if err != nil {
-			log.Println("failed to process url", url, "with", err)
-			continue
-		}
-
-		// check for paywall
-		if !paywall_false_pattern.Match([]byte(res.Content)) {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// construct goquery doc
-		doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(res.Content)))
-		if err != nil {
-			log.Println("failed to process url", url, "with", err)
-			continue
-		}
-
-		// check for article type
-		tag = doc.Find("meta[property='og:type']")
-		pagetype, exists = tag.Attr("content")
-		if !exists || pagetype != "article" {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// get title
-		tag = doc.Find("meta[property='og:title']")
-		title, exists = tag.Attr("content")
-		if !exists {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// prepend description to content of article
-		tag = doc.Find("meta[name='description']")
-		content, exists = tag.Attr("content")
-		content += " "
-		if !exists {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// get publishing date
-		tag = doc.Find("meta[name='date']")
-		datestr, exists = tag.Attr("content")
-		if !exists {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
-		if err != nil {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// get author
-		tag = doc.Find("meta[name='author']")
-		author, exists = tag.Attr("content")
-		if !exists {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// get content
-		tag = doc.Find("main[id='Inhalt'] div > p")
-
-		tag.Each(func(index int, p *goquery.Selection) {
-			content += " " + p.Text()
-		})
-
-		// clean up content string
-		content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
-		content = strings.ReplaceAll(content, "»", "\"")
-		content = strings.ReplaceAll(content, "«", "\"")
-
-		// insert new article
-		article := model.Article{
-			SourceUrl:   url,
-			PublishDate: date,
-			FetchDate:   res.FetchDate,
-			Title:       title,
-			Content:     content,
-			Author:      author,
-		}
-
-		err = extractor.Articles.Insert(&article)
-		if err != nil {
-			log.Println("failed to insert", article)
-		} else {
-			extractor.Responses.Processed(url)
-			log.Println("found article at", url)
-		}
-	}
-
-	return nil
-}
diff --git a/cmd/crawler/main.go b/cmd/crawler/main.go
index 0acc7de..d9cea5a 100644
--- a/cmd/crawler/main.go
+++ b/cmd/crawler/main.go
@@ -1,12 +1,11 @@
 package main
 
 import (
-	"crowsnest/cmd/crawler/extractors"
+	"crowsnest/cmd/crawler/collectors"
 	"crowsnest/internal/model/database"
 	"database/sql"
 	"log"
 	"os"
-	"time"
 
 	_ "github.com/lib/pq"
 )
@@ -23,21 +22,11 @@ func main() {
 	defer db.Close()
 
 	// collect websites
-	//coll := collectors.Collector{
-	//	Responses: &database.ResponseModel{DB: db},
-	//}
-
-	//coll.Spiegel()
-	//coll.Zeit()
-
-	// extract articles from websites
-	extr := extractors.Extractor{
+	coll := collectors.Collector{
 		Responses: &database.ResponseModel{DB: db},
 		Articles:  &database.ArticleModel{DB: db},
 	}
 
-	for {
-		extr.Spiegel()
-		time.Sleep(5 * time.Second)
-	}
+	coll.SpiegelCollect()
+	//coll.Zeit()
 }
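
For reference, a minimal standalone sketch of the meta-tag extraction that SpiegelExtract now performs inline in the collector. It reuses the goquery selectors and the time.Parse layout from the diff above; the sample HTML document, its values, and the main() wrapper are illustrative assumptions, not part of the crawler or of this patch.

package main

import (
	"fmt"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// sample carries the meta tags SpiegelExtract relies on; the values are made up.
const sample = `<html><head>
<meta property="og:type" content="article">
<meta property="og:title" content="Example title">
<meta name="description" content="Example description.">
<meta name="date" content="2025-01-07T09:00:00+01:00">
<meta name="author" content="Jane Doe">
</head><body><main id="Inhalt"><div><p>First paragraph.</p></div></main></body></html>`

func main() {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(sample))
	if err != nil {
		panic(err)
	}

	// Same selectors as SpiegelExtract uses on the fetched body.
	title, _ := doc.Find("meta[property='og:title']").Attr("content")
	datestr, _ := doc.Find("meta[name='date']").Attr("content")
	author, _ := doc.Find("meta[name='author']").Attr("content")

	// Same layout string as the patch passes to time.Parse.
	date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
	if err != nil {
		panic(err)
	}

	// Same paragraph selector; article paragraphs are appended to the description.
	var content strings.Builder
	doc.Find("main[id='Inhalt'] div > p").Each(func(_ int, p *goquery.Selection) {
		content.WriteString(" " + p.Text())
	})

	fmt.Println(title, "by", author, "on", date.Format(time.RFC3339), "-", strings.TrimSpace(content.String()))
}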