restructuring crawler architecture

2025-01-20 08:58:49 +01:00
parent 9104bc7716
commit 47299d6ef3
13 changed files with 408 additions and 330 deletions
--- a/src/cmd/frontend/main.go
+++ b/src/cmd/frontend/main.go
@@ -4,10 +4,10 @@ import (
 	"crowsnest/internal/app"
 	"crowsnest/internal/crawler"
 	"crowsnest/internal/middleware"
 	"crowsnest/internal/model"
 	"crowsnest/internal/model/database"
 	"log"
 	"net/http"
 	"time"
 	_ "github.com/lib/pq"
 )
@@ -19,16 +19,16 @@ func main() {
 	}
 	// run web crawlers
-	coll := crawler.Crawler{
+    articles := &database.ArticleModel{DB: db}
-		Articles: &database.ArticleModel{DB: db},
+	crawler := crawler.CrawlerFacade{}
-	}
+	crawler.Init()
-	go func() {
+
-		for {
+	crawler.SubscribeToSpiegelFeed(func(a *model.Article) {
-			coll.ZeitCollectIndex()
+        articles.Insert(a)
-			coll.SpiegelCollectIndex()
+	})
-			time.Sleep(5 * time.Minute)
+	crawler.SubscribeToZeitFeed(func(a *model.Article) {
-		}
+        articles.Insert(a)
-	}()
+	})
 	// define app
 	webapp := app.NewApp(db)
--- a/src/internal/crawler/CrawlerFacade.go
+++ b/src/internal/crawler/CrawlerFacade.go
@@ -0,0 +1,69 @@
 package crawler
 import (
 	"crowsnest/internal/model"
 	"crowsnest/internal/util"
 	"github.com/gocolly/colly/v2"
 )
 // CrawlerFacade bundles the Spiegel and Zeit crawling pipelines behind a
 // single type. Init sets the pipelines up; the Subscribe methods register
 // hooks with the distributer of the respective feed.
 type CrawlerFacade struct {
     // distributer the articles converted from the Spiegel feed are published to
     spiegelFeedDistributer *util.Distributer[*model.Article]
     // distributer the articles converted from the Zeit feed are published to
     zeitFeedDistributer *util.Distributer[*model.Article]
 }
 // Init creates both distributers and starts the Spiegel and the Zeit
 // pipeline. Each pipeline consists of a WebFeed crawling the index page of
 // the site, a converter turning the crawled pages into articles and a
 // goroutine publishing those articles to the matching distributer.
 //
 // Init has to be called before the Subscribe methods, because the
 // distributers they use are nil until Init has run.
 func (cf *CrawlerFacade) Init() {
 	// init
 	cf.spiegelFeedDistributer = &util.Distributer[*model.Article]{}
 	cf.spiegelFeedDistributer.Init()
 	cf.zeitFeedDistributer = &util.Distributer[*model.Article]{}
 	cf.zeitFeedDistributer.Init()
 	// run spiegel feed
 	spiegelFeed := &WebFeed{}
 	spiegelFeed.Init(
 		"https://www.spiegel.de/",
 		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
 		colly.CacheDir("./persistence/spiegel_cache"),
 		colly.MaxDepth(1),
 	)
 	spiegelConverter := &ConverterSpiegel{}
 	spiegelConverter.Init()
 	go publishConverted(spiegelFeed.Feed(), spiegelConverter.Convert, cf.spiegelFeedDistributer)
 	// run zeit feed
 	zeitFeed := &WebFeed{}
 	zeitFeed.Init(
 		"https://www.zeit.de/index",
 		colly.AllowedDomains("www.zeit.de", "zeit.de"),
 		colly.CacheDir("./persistence/zeit_cache"),
 		colly.MaxDepth(1),
 	)
 	zeitConverter := &ZeitConverter{}
 	zeitConverter.Init()
 	go publishConverted(zeitFeed.Feed(), zeitConverter.Convert, cf.zeitFeedDistributer)
 }
 // publishConverted reads resources from feed until the channel is closed,
 // converts each of them and publishes the resulting article to out. Resources
 // for which convert returns an error (the converters do so for paywalled
 // pages, pages that are not articles, missing meta tags, ...) are skipped.
 func publishConverted(
 	feed <-chan *Resource,
 	convert func(*Resource) (*model.Article, error),
 	out *util.Distributer[*model.Article],
 ) {
 	for res := range feed {
 		article, err := convert(res)
 		if err != nil {
 			continue
 		}
 		out.Publish(article)
 	}
 }
 // SubscribeToSpiegelFeed registers hook with the distributer the articles of
 // the Spiegel feed are published to.
 func (cf *CrawlerFacade) SubscribeToSpiegelFeed(hook func(*model.Article)) {
 	distributer := cf.spiegelFeedDistributer
 	distributer.Subscribe(hook)
 }
 // SubscribeToZeitFeed registers hook with the distributer the articles of
 // the Zeit feed are published to.
 func (cf *CrawlerFacade) SubscribeToZeitFeed(hook func(*model.Article)) {
 	distributer := cf.zeitFeedDistributer
 	distributer.Subscribe(hook)
 }
--- a/src/internal/crawler/Resource.go
+++ b/src/internal/crawler/Resource.go
@@ -0,0 +1,6 @@
 package crawler
 // Resource is a single crawled page as it is handed from a WebFeed to a
 // converter.
 type Resource struct {
 	// Url is the URL the page was requested from.
 	Url  string
 	// Body is the response body of the page as a string.
 	Body string
 }
--- a/src/internal/crawler/SpiegelConverter.go
+++ b/src/internal/crawler/SpiegelConverter.go
@@ -0,0 +1,96 @@
 package crawler
 import (
 	"crowsnest/internal/model"
 	"errors"
 	"regexp"
 	"strings"
 	"time"
 	"github.com/PuerkitoBio/goquery"
 )
 // ConverterSpiegel turns pages crawled from spiegel.de into articles. Init
 // has to be called before Convert, because it compiles the patterns Convert
 // relies on.
 type ConverterSpiegel struct {
 	// matches the marker of an active paywall inside the page body
 	pattern_paywall    *regexp.Regexp
 	// matches the URLs that are accepted for conversion
 	pattern_url        *regexp.Regexp
 	// matches runs of whitespace that get collapsed in the article content
 	pattern_whitespace *regexp.Regexp
 }
 // Init compiles the patterns used by Convert.
 func (c *ConverterSpiegel) Init() {
 	c.pattern_paywall = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
 	// The dot of the domain is escaped; unescaped it would match any
 	// character and accept hosts such as "spiegelxde".
 	c.pattern_url = regexp.MustCompile(`^https://(www\.)?spiegel\.de.*`)
 	c.pattern_whitespace = regexp.MustCompile(`\s+`)
 }
 // Convert extracts an article from a page crawled from spiegel.de.
 //
 // An error is returned when the URL does not match the spiegel.de pattern,
 // the body contains the marker of an active paywall, the body cannot be
 // parsed, the page is not of og:type "article", one of the title, description
 // or date meta tags is missing, or the date cannot be parsed.
 //
 // The content of the returned article is the meta description followed by
 // the text of the paragraphs of the main element, with runs of whitespace
 // collapsed to a single space and guillemets replaced by double quotes.
 func (c *ConverterSpiegel) Convert(res *Resource) (*model.Article, error) {
 	// check url pattern
 	if !c.pattern_url.MatchString(res.Url) {
 		return nil, errors.New("invalid url pattern")
 	}
 	// check for paywall; matching the string directly avoids copying the body
 	if c.pattern_paywall.MatchString(res.Body) {
 		return nil, errors.New("unable to extract article due to paywall")
 	}
 	// construct goquery doc
 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Body))
 	if err != nil {
 		return nil, err
 	}
 	// meta reads the content attribute of the element selected by selector
 	meta := func(selector string) (string, bool) {
 		return doc.Find(selector).Attr("content")
 	}
 	// check for article type
 	if pagetype, ok := meta("meta[property='og:type']"); !ok || pagetype != "article" {
 		return nil, errors.New("unable to extract article, not of type article")
 	}
 	// get title
 	title, ok := meta("meta[property='og:title']")
 	if !ok {
 		return nil, errors.New("unable to extract article, no title tag")
 	}
 	// get description, it is prepended to the content of the article
 	description, ok := meta("meta[name='description']")
 	if !ok {
 		return nil, errors.New("unable to extract article, no description tag")
 	}
 	// get publishing date
 	datestr, ok := meta("meta[name='date']")
 	if !ok {
 		return nil, errors.New("unable to extract article, no date tag")
 	}
 	date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
 	if err != nil {
 		return nil, err
 	}
 	// get content; a builder avoids re-copying the text for every paragraph
 	var raw strings.Builder
 	raw.WriteString(description)
 	raw.WriteString(" ")
 	doc.Find("main[id='Inhalt'] div > p").Each(func(_ int, p *goquery.Selection) {
 		raw.WriteString(" ")
 		raw.WriteString(p.Text())
 	})
 	// clean up content string
 	content := c.pattern_whitespace.ReplaceAllString(raw.String(), " ")
 	content = strings.ReplaceAll(content, "»", "\"")
 	content = strings.ReplaceAll(content, "«", "\"")
 	// create new article
 	return &model.Article{
 		SourceUrl:   res.Url,
 		PublishDate: date,
 		FetchDate:   time.Now(),
 		Title:       title,
 		Content:     content,
 	}, nil
 }
--- a/src/internal/crawler/WebFeed.go
+++ b/src/internal/crawler/WebFeed.go
@@ -0,0 +1,67 @@
 package crawler
 import (
 	"crowsnest/internal/util"
 	"log"
 	"strings"
 	"time"
 	"github.com/gocolly/colly/v2"
 )
 // WebFeed crawls a web site with a colly collector and hands every response
 // it receives out as a Resource through a channel.
 type WebFeed struct {
 	// channel the crawled Resources are written to; created in Init
 	feed      chan *Resource
 	// collector performing the requests; created in Init
 	collector *colly.Collector
 }
 // Init sets the WebFeed up and launches the goroutine that crawls indexUrl.
 // The options are passed on to the colly collector; the callers in this
 // package use them to set the allowed domains, the cache directory and the
 // maximum depth.
 func (sf *WebFeed) Init(indexUrl string, options ...colly.CollectorOption) {
 	// buffered channel the crawled pages are handed out through
 	sf.feed = make(chan *Resource, 100)
 	// the collector is configured entirely by the given options
 	sf.collector = colly.NewCollector(options...)
 	// every response is written to the feed as a Resource
 	sf.collector.OnResponse(func(resp *colly.Response) {
 		sf.feed <- &Resource{
 			Url:  resp.Request.URL.String(),
 			Body: string(resp.Body),
 		}
 	})
 	// cascade: follow every link whose href starts with "http"
 	sf.collector.OnHTML("a[href]", func(anchor *colly.HTMLElement) {
 		if target := anchor.Attr("href"); strings.HasPrefix(target, "http") {
 			anchor.Request.Visit(target)
 		}
 	})
 	// start runner
 	go sf.runner(indexUrl)
 }
 // Feed returns the receive-only channel into which the collected Resources
 // are written. The channel is created by Init, so Feed returns nil before
 // Init has been called.
 func (sf *WebFeed) Feed() <-chan *Resource {
 	return sf.feed
 }
 // runner crawls the index page in an endless loop: it fetches all URLs
 // found on indexUrl, visits each of them with the collector and then pauses
 // for five minutes. It is started as a goroutine by Init and never returns.
 //
 // The crawl happens before the pause, so that the first resources are
 // collected right after start-up instead of five minutes later.
 func (sf *WebFeed) runner(indexUrl string) {
 	// pause between two crawls of the index
 	const interval = 5 * time.Minute
 	for {
 		// collect index
 		urls, err := util.GetAllURLs(indexUrl)
 		if err != nil {
 			log.Println("error in WebFeed runner: ", err.Error())
 		} else {
 			// visit urls
 			for _, url := range urls {
 				sf.collector.Visit(url)
 			}
 		}
 		// pause after failed crawls as well, so that a persistent error
 		// does not turn the loop into a busy loop
 		time.Sleep(interval)
 	}
 }
--- a/src/internal/crawler/ZeitConverter.go
+++ b/src/internal/crawler/ZeitConverter.go
@@ -0,0 +1,100 @@
 package crawler
 import (
 	"crowsnest/internal/model"
 	"errors"
 	"regexp"
 	"strings"
 	"time"
 	"github.com/PuerkitoBio/goquery"
 )
 // ZeitConverter turns pages crawled from zeit.de into articles. Init has to
 // be called before Convert, because it compiles the patterns Convert uses.
 type ZeitConverter struct {
 	// matches the URLs that are accepted for conversion
 	pattern_url        *regexp.Regexp
 	// matches runs of whitespace that get collapsed in the article content
 	pattern_whitespace *regexp.Regexp
 }
 // Init compiles the patterns used by Convert.
 func (c *ZeitConverter) Init() {
 	var (
 		// https URLs on zeit.de (with or without www) that carry no fragment
 		urlPattern        = regexp.MustCompile(`^https://(www\.)?zeit\.de[^#]*$`)
 		whitespacePattern = regexp.MustCompile(`\s+`)
 	)
 	c.pattern_url, c.pattern_whitespace = urlPattern, whitespacePattern
 }
 // Convert extracts an article from a page crawled from zeit.de.
 //
 // An error is returned when the URL does not match the zeit.de pattern, the
 // body cannot be parsed, the page is not of og:type "article", its
 // article:content_tier is not "free", one of the title, description or date
 // meta tags is missing, the description mentions "Das Liveblog", or the date
 // cannot be parsed.
 //
 // The content of the returned article is the meta description followed by
 // the text of the paragraphs of the article body, with runs of whitespace
 // collapsed to a single space and guillemets replaced by double quotes.
 func (c *ZeitConverter) Convert(res *Resource) (*model.Article, error) {
 	// check url pattern
 	if !c.pattern_url.MatchString(res.Url) {
 		return nil, errors.New("invalid url pattern")
 	}
 	// construct goquery doc
 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Body))
 	if err != nil {
 		return nil, err
 	}
 	// meta reads the content attribute of the element selected by selector
 	meta := func(selector string) (string, bool) {
 		return doc.Find(selector).Attr("content")
 	}
 	// check for article type
 	if pagetype, ok := meta("meta[property='og:type']"); !ok || pagetype != "article" {
 		return nil, errors.New("unable to extract article, not of type article")
 	}
 	// check for paywall
 	if tier, ok := meta("meta[property='article:content_tier']"); !ok || tier != "free" {
 		return nil, errors.New("unable to extract article due to paywall")
 	}
 	// get title
 	title, ok := meta("meta[property='og:title']")
 	if !ok {
 		return nil, errors.New("unable to extract article, no title tag")
 	}
 	// get description, it is prepended to the content of the article
 	description, ok := meta("meta[name='description']")
 	if !ok {
 		return nil, errors.New("unable to extract article, no description tag")
 	}
 	if strings.Contains(description, "Das Liveblog") {
 		return nil, errors.New("unable to extract article, no support for liveblog")
 	}
 	// get publishing date
 	datestr, ok := meta("meta[name='date']")
 	if !ok {
 		return nil, errors.New("unable to extract article, no date tag")
 	}
 	date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
 	if err != nil {
 		return nil, err
 	}
 	// get content; a builder avoids re-copying the text for every paragraph
 	var raw strings.Builder
 	raw.WriteString(description)
 	raw.WriteString(" ")
 	doc.Find("main > article > div.article-body p.article__item").Each(func(_ int, p *goquery.Selection) {
 		raw.WriteString(" ")
 		raw.WriteString(p.Text())
 	})
 	// clean up content string
 	content := c.pattern_whitespace.ReplaceAllString(raw.String(), " ")
 	content = strings.ReplaceAll(content, "»", "\"")
 	content = strings.ReplaceAll(content, "«", "\"")
 	// create new article
 	return &model.Article{
 		SourceUrl:   res.Url,
 		PublishDate: date,
 		FetchDate:   time.Now(),
 		Title:       title,
 		Content:     content,
 	}, nil
 }
--- a/src/internal/crawler/spiegel.go
+++ b/src/internal/crawler/spiegel.go
@@ -1,152 +0,0 @@
 package crawler
 import (
 	"crowsnest/internal/model"
 	"errors"
 	"fmt"
 	"regexp"
 	"strings"
 	"time"
 	"github.com/PuerkitoBio/goquery"
 	"github.com/gocolly/colly/v2"
 )
 // SpiegelCollector builds a colly collector for spiegel.de. Every response
 // is passed to SpiegelExtract and every link whose href starts with "http" is
 // followed.
 func (c *Crawler) SpiegelCollector() *colly.Collector {
 	// allowed domains, cache directory and max recursion depth
 	spiegel := colly.NewCollector(
 		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
 		colly.CacheDir("./persistence/spiegel_cache"),
 		colly.MaxDepth(5),
 	)
 	// store articles
 	spiegel.OnResponse(func(resp *colly.Response) {
 		c.SpiegelExtract(resp.Request.URL.String(), resp.Body)
 	})
 	// cascade
 	spiegel.OnHTML("a[href]", func(anchor *colly.HTMLElement) {
 		if target := anchor.Attr("href"); strings.HasPrefix(target, "http") {
 			anchor.Request.Visit(target)
 		}
 	})
 	return spiegel
 }
 // SpiegelCollectIndex fetches all URLs found on the spiegel.de front page
 // and visits each of them with a collector limited to a depth of 1. Only a
 // failure to fetch the URLs of the front page is reported as an error.
 func (c *Crawler) SpiegelCollectIndex() error {
 	links, err := c.GetAllURLs("https://www.spiegel.de/")
 	if err != nil {
 		return err
 	}
 	indexCollector := c.SpiegelCollector()
 	indexCollector.MaxDepth = 1
 	for i := range links {
 		indexCollector.Visit(links[i])
 	}
 	return nil
 }
 // SpiegelCollectArchive visits the daily archive page of spiegel.de for
 // every day from 2020-01-01 up to the current time, using the collector built
 // by SpiegelCollector.
 func (c *Crawler) SpiegelCollectArchive() {
 	collector := c.SpiegelCollector()
 	// go through archive, one page per day
 	first := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
 	now := time.Now()
 	for day := first; !day.After(now); day = day.AddDate(0, 0, 1) {
 		collector.Visit(fmt.Sprintf(
 			"https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html",
 			day.Format("02.01.2006"),
 		))
 	}
 }
 // Patterns used by SpiegelExtract. They are compiled once at package level
 // instead of on every call, since SpiegelExtract runs for every response.
 var (
 	spiegelPaywallPattern = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
 	// the dot of the domain is escaped so that it only matches a literal dot
 	spiegelURLPattern        = regexp.MustCompile(`^https://(www\.)?spiegel\.de.*`)
 	spiegelWhitespacePattern = regexp.MustCompile(`\s+`)
 )
 // SpiegelExtract extracts an article from the body of a spiegel.de page and
 // inserts it into the article model.
 //
 // An error is returned when the URL does not match the spiegel.de pattern,
 // the body contains the marker of an active paywall, the body cannot be
 // parsed, the page is not of og:type "article", one of the title, description
 // or date meta tags is missing, the date cannot be parsed, or the insert
 // fails.
 func (c *Crawler) SpiegelExtract(url string, body []byte) error {
 	// check url pattern
 	if !spiegelURLPattern.MatchString(url) {
 		return errors.New("invalid url pattern")
 	}
 	// check for paywall
 	if spiegelPaywallPattern.Match(body) {
 		return errors.New("unable to extract article due to paywall")
 	}
 	// construct goquery doc
 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
 	if err != nil {
 		return err
 	}
 	// meta reads the content attribute of the element selected by selector
 	meta := func(selector string) (string, bool) {
 		return doc.Find(selector).Attr("content")
 	}
 	// check for article type
 	if pagetype, ok := meta("meta[property='og:type']"); !ok || pagetype != "article" {
 		return errors.New("unable to extract article, not of type article")
 	}
 	// get title
 	title, ok := meta("meta[property='og:title']")
 	if !ok {
 		return errors.New("unable to extract article, no title tag")
 	}
 	// get description, it is prepended to the content of the article
 	description, ok := meta("meta[name='description']")
 	if !ok {
 		return errors.New("unable to extract article, no description tag")
 	}
 	// get publishing date
 	datestr, ok := meta("meta[name='date']")
 	if !ok {
 		return errors.New("unable to extract article, no date tag")
 	}
 	date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
 	if err != nil {
 		return err
 	}
 	// get content
 	var raw strings.Builder
 	raw.WriteString(description)
 	raw.WriteString(" ")
 	doc.Find("main[id='Inhalt'] div > p").Each(func(_ int, p *goquery.Selection) {
 		raw.WriteString(" ")
 		raw.WriteString(p.Text())
 	})
 	// clean up content string
 	content := spiegelWhitespacePattern.ReplaceAllString(raw.String(), " ")
 	content = strings.ReplaceAll(content, "»", "\"")
 	content = strings.ReplaceAll(content, "«", "\"")
 	// insert new article
 	article := model.Article{
 		SourceUrl:   url,
 		PublishDate: date,
 		FetchDate:   time.Now(),
 		Title:       title,
 		Content:     content,
 	}
 	return c.Articles.Insert(&article)
 }
--- a/src/internal/crawler/zeit.go
+++ b/src/internal/crawler/zeit.go
@@ -1,160 +0,0 @@
 package crawler
 import (
 	"crowsnest/internal/model"
 	"errors"
 	"fmt"
 	"regexp"
 	"strings"
 	"time"
 	"github.com/PuerkitoBio/goquery"
 	"github.com/gocolly/colly/v2"
 )
 // ZeitCollector builds a colly collector for zeit.de. Every response is
 // passed to ZeitExtract and every link whose href starts with "http" is
 // followed.
 func (c *Crawler) ZeitCollector() *colly.Collector {
 	// allowed domains, cache directory and max recursion depth
 	zeit := colly.NewCollector(
 		colly.AllowedDomains("www.zeit.de", "zeit.de"),
 		colly.CacheDir("./persistence/zeit_cache"),
 		colly.MaxDepth(5),
 	)
 	// store articles
 	zeit.OnResponse(func(resp *colly.Response) {
 		c.ZeitExtract(resp.Request.URL.String(), resp.Body)
 	})
 	// cascade
 	zeit.OnHTML("a[href]", func(anchor *colly.HTMLElement) {
 		if target := anchor.Attr("href"); strings.HasPrefix(target, "http") {
 			anchor.Request.Visit(target)
 		}
 	})
 	return zeit
 }
 // ZeitCollectIndex fetches all URLs found on the zeit.de index page and
 // visits each of them with a collector limited to a depth of 1. Only a
 // failure to fetch the URLs of the index page is reported as an error.
 func (c *Crawler) ZeitCollectIndex() error {
 	links, err := c.GetAllURLs("https://www.zeit.de/index")
 	if err != nil {
 		return err
 	}
 	indexCollector := c.ZeitCollector()
 	indexCollector.MaxDepth = 1
 	for i := range links {
 		indexCollector.Visit(links[i])
 	}
 	return nil
 }
 // ZeitCollectArchive visits the weekly archive index pages of zeit.de, in
 // steps of seven days from 2020-01-01 up to the current time, using the
 // collector built by ZeitCollector.
 func (c *Crawler) ZeitCollectArchive() {
 	collector := c.ZeitCollector()
 	// go through archive, one page per ISO week
 	first := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
 	//first := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
 	now := time.Now()
 	for day := first; !day.After(now); day = day.AddDate(0, 0, 7) {
 		year, week := day.ISOWeek()
 		collector.Visit(fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week))
 	}
 }
 // Patterns used by ZeitExtract. They are compiled once at package level
 // instead of on every call, since ZeitExtract runs for every response.
 var (
 	zeitURLPattern        = regexp.MustCompile(`^https://(www\.)?zeit\.de[^#]*$`)
 	zeitWhitespacePattern = regexp.MustCompile(`\s+`)
 )
 // ZeitExtract extracts an article from the body of a zeit.de page and
 // inserts it into the article model.
 //
 // An error is returned when the URL does not match the zeit.de pattern, the
 // body cannot be parsed, the page is not of og:type "article", its
 // article:content_tier is not "free", one of the title, description or date
 // meta tags is missing, the description mentions "Das Liveblog", the date
 // cannot be parsed, or the insert fails.
 func (c *Crawler) ZeitExtract(url string, body []byte) error {
 	// check url pattern
 	if !zeitURLPattern.MatchString(url) {
 		return errors.New("invalid url pattern")
 	}
 	// construct goquery doc
 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
 	if err != nil {
 		return err
 	}
 	// meta reads the content attribute of the element selected by selector
 	meta := func(selector string) (string, bool) {
 		return doc.Find(selector).Attr("content")
 	}
 	// check for article type
 	if pagetype, ok := meta("meta[property='og:type']"); !ok || pagetype != "article" {
 		return errors.New("unable to extract article, not of type article")
 	}
 	// check for paywall
 	if tier, ok := meta("meta[property='article:content_tier']"); !ok || tier != "free" {
 		return errors.New("unable to extract article due to paywall")
 	}
 	// get title
 	title, ok := meta("meta[property='og:title']")
 	if !ok {
 		return errors.New("unable to extract article, no title tag")
 	}
 	// get description, it is prepended to the content of the article
 	description, ok := meta("meta[name='description']")
 	if !ok {
 		return errors.New("unable to extract article, no description tag")
 	}
 	if strings.Contains(description, "Das Liveblog") {
 		return errors.New("unable to extract article, no support for liveblog")
 	}
 	// get publishing date
 	datestr, ok := meta("meta[name='date']")
 	if !ok {
 		return errors.New("unable to extract article, no date tag")
 	}
 	date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
 	if err != nil {
 		return err
 	}
 	// get content
 	var raw strings.Builder
 	raw.WriteString(description)
 	raw.WriteString(" ")
 	doc.Find("main > article > div.article-body p.article__item").Each(func(_ int, p *goquery.Selection) {
 		raw.WriteString(" ")
 		raw.WriteString(p.Text())
 	})
 	// clean up content string
 	content := zeitWhitespacePattern.ReplaceAllString(raw.String(), " ")
 	content = strings.ReplaceAll(content, "»", "\"")
 	content = strings.ReplaceAll(content, "«", "\"")
 	// insert new article
 	article := model.Article{
 		SourceUrl:   url,
 		PublishDate: date,
 		FetchDate:   time.Now(),
 		Title:       title,
 		Content:     content,
 	}
 	return c.Articles.Insert(&article)
 }
--- a/src/internal/model/article.go
+++ b/src/internal/model/article.go
@@ -16,6 +16,18 @@ type Article struct {
 	AiSummary   string
 }
 // Clone returns a newly allocated Article that carries the same Id,
 // SourceUrl, PublishDate, FetchDate, Title, Content and AiSummary as a.
 func (a *Article) Clone() *Article {
 	clone := new(Article)
 	clone.Id = a.Id
 	clone.SourceUrl = a.SourceUrl
 	clone.PublishDate = a.PublishDate
 	clone.FetchDate = a.FetchDate
 	clone.Title = a.Title
 	clone.Content = a.Content
 	clone.AiSummary = a.AiSummary
 	return clone
 }
 // TODO docstring
 type ArticleViewModel struct {
 	Id           int
--- a/src/internal/util/Converter.go
+++ b/src/internal/util/Converter.go
@@ -0,0 +1,3 @@
 package util
 // Converter is a function that turns a value of type I into a value of type
 // O, or returns an error.
 type Converter[I any, O any] func(I) (O, error)
--- a/src/internal/util/Distributer.go
+++ b/src/internal/util/Distributer.go
@@ -0,0 +1,37 @@
 package util
 // Distributer hands a clone of every published item to each subscribed
 // hook. Init has to be called before Publish or Subscribe are used.
 //
 // The hooks slice is only touched by the runner goroutine once Init has
 // returned; Subscribe passes new hooks to it through a channel, so Publish
 // and Subscribe may be called from different goroutines.
 type Distributer[T IClone[T]] struct {
 	// published items waiting to be distributed
 	queue chan T
 	// subscribed hooks, owned by the runner goroutine
 	hooks []func(T)
 	// hands newly subscribed hooks over to the runner goroutine
 	subs chan func(T)
 }
 // Init creates the channels of the Distributer and starts its runner
 // goroutine.
 func (d *Distributer[T]) Init() {
 	d.queue = make(chan T, 100)
 	d.hooks = make([]func(T), 0)
 	d.subs = make(chan func(T))
 	go d.runner()
 }
 // Publish queues an item; a clone of it is distributed to every hook that
 // has subscribed to this Distributer. Publish blocks while 100 items are
 // already waiting in the queue.
 func (d *Distributer[T]) Publish(item T) {
 	d.queue <- item
 }
 // Subscribe adds a new hook to the Distributer. The hook is called in its
 // own goroutine for every item that is distributed after Subscribe returned.
 func (d *Distributer[T]) Subscribe(hook func(T)) {
 	d.subs <- hook
 }
 // runner is started by Init. It registers the hooks passed to Subscribe and
 // distributes a clone of every published item to all registered hooks. As
 // long as no hook is registered, published items stay in the queue, so the
 // first subscriber still receives them.
 func (d *Distributer[T]) runner() {
 	for {
 		// a nil channel is never ready, which keeps the items queued
 		// until there is at least one hook
 		var queue chan T
 		if len(d.hooks) > 0 {
 			queue = d.queue
 		}
 		select {
 		case hook := <-d.subs:
 			d.hooks = append(d.hooks, hook)
 		case val, ok := <-queue:
 			if !ok {
 				return
 			}
 			for _, f := range d.hooks {
 				go f(val.Clone())
 			}
 		}
 	}
 }
--- a/src/internal/util/IClone.go
+++ b/src/internal/util/IClone.go
@@ -0,0 +1,5 @@
 package util
 // IClone is implemented by types that can produce a clone of themselves.
 // Distributer uses it as the constraint of its item type.
 type IClone[T any] interface {
     // Clone returns the clone as a value of type T.
     Clone() T
 }
--- a/src/internal/crawler/crawler.go
+++ b/src/internal/crawler/crawler.go
@@ -1,19 +1,14 @@
-package crawler
+package util
 import (
 	"crowsnest/internal/model/database"
 	"fmt"
 	"net/http"
 	"github.com/PuerkitoBio/goquery"
 )
 type Crawler struct {
 	Articles *database.ArticleModel
 }
 // GetAllURLs fetches all URLs from a given web page URL
-func (c *Crawler) GetAllURLs(pageURL string) ([]string, error) {
+func GetAllURLs(pageURL string) ([]string, error) {
 	// Send a GET request to the provided URL
 	resp, err := http.Get(pageURL)
 	if err != nil {
		`@@ -0,0 +1,3 @@`
							`package util`

							`type Converter[I any, O any] func(I) (O, error)`