Restructure crawler architecture

2025-01-20 08:58:49 +01:00
parent 9104bc7716
commit 47299d6ef3
13 changed files with 408 additions and 330 deletions

View File

@@ -4,10 +4,10 @@ import (
"crowsnest/internal/app"
"crowsnest/internal/crawler"
"crowsnest/internal/middleware"
"crowsnest/internal/model"
"crowsnest/internal/model/database"
"log"
"net/http"
"time"
_ "github.com/lib/pq"
)
@@ -19,16 +19,16 @@ func main() {
}
// run web crawlers
coll := crawler.Crawler{
Articles: &database.ArticleModel{DB: db},
}
go func() {
for {
coll.ZeitCollectIndex()
coll.SpiegelCollectIndex()
time.Sleep(5 * time.Minute)
}
}()
articles := &database.ArticleModel{DB: db}
facade := crawler.CrawlerFacade{}
facade.Init()
facade.SubscribeToSpiegelFeed(func(a *model.Article) {
articles.Insert(a)
})
facade.SubscribeToZeitFeed(func(a *model.Article) {
articles.Insert(a)
})
// define app
webapp := app.NewApp(db)

View File

@@ -0,0 +1,69 @@
package crawler
import (
"crowsnest/internal/model"
"crowsnest/internal/util"
"github.com/gocolly/colly/v2"
)
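// CrawlerFacade wires the site-specific web feeds to their converters and
// fans the resulting articles out to subscribers.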
type CrawlerFacade struct {
spiegelFeedDistributer *util.Distributer[*model.Article]
zeitFeedDistributer *util.Distributer[*model.Article]
}
func (cf *CrawlerFacade) Init() {
// create the distributers that fan collected articles out to subscribers
cf.spiegelFeedDistributer = &util.Distributer[*model.Article]{}
cf.spiegelFeedDistributer.Init()
cf.zeitFeedDistributer = &util.Distributer[*model.Article]{}
cf.zeitFeedDistributer.Init()
// run spiegel feed
sf := &WebFeed{}
sf.Init(
"https://www.spiegel.de/",
colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
colly.CacheDir("./persistence/spiegel_cache"),
colly.MaxDepth(1),
)
sfFeed := sf.Feed()
sfConverter := ConverterSpiegel{}
sfConverter.Init()
go func() {
for val := range sfFeed {
article, err := sfConverter.Convert(val)
if err != nil {
continue
}
cf.spiegelFeedDistributer.Publish(article)
}
}()
// run zeit feed
zf := &WebFeed{}
zf.Init(
"https://www.zeit.de/index",
colly.AllowedDomains("www.zeit.de", "zeit.de"),
colly.CacheDir("./persistence/zeit_cache"),
colly.MaxDepth(1),
)
zfFeed := zf.Feed()
zfConverter := ZeitConverter{}
zfConverter.Init()
go func() {
for val := range zfFeed {
article, err := zfConverter.Convert(val)
if err != nil {
continue
}
cf.zeitFeedDistributer.Publish(article)
}
}()
}
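// SubscribeToSpiegelFeed registers a hook that is called for every article
// extracted from the Spiegel feed.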
func (cf *CrawlerFacade) SubscribeToSpiegelFeed(hook func(*model.Article)) {
cf.spiegelFeedDistributer.Subscribe(hook)
}
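// SubscribeToZeitFeed registers a hook that is called for every article
// extracted from the Zeit feed.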
func (cf *CrawlerFacade) SubscribeToZeitFeed(hook func(*model.Article)) {
cf.zeitFeedDistributer.Subscribe(hook)
}

View File

@@ -0,0 +1,6 @@
package crawler
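// Resource is a single page fetched by a WebFeed: its URL and the raw
// response body.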
type Resource struct {
Url string
Body string
}

View File

@@ -0,0 +1,96 @@
package crawler
import (
"crowsnest/internal/model"
"errors"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
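// ConverterSpiegel turns a fetched spiegel.de page into a model.Article,
// rejecting paywalled pages and pages that are not articles.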
type ConverterSpiegel struct {
pattern_paywall *regexp.Regexp
pattern_url *regexp.Regexp
pattern_whitespace *regexp.Regexp
}
func (c *ConverterSpiegel) Init() {
c.pattern_paywall = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
c.pattern_url = regexp.MustCompile(`^https://(www\.)?spiegel\.de.*`)
c.pattern_whitespace = regexp.MustCompile(`\s+`)
}
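// Convert parses the given Resource and returns the extracted article, or an
// error if the page is paywalled, not an article, or missing required metadata.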
func (c *ConverterSpiegel) Convert(res *Resource) (*model.Article, error) {
// check URL pattern
if !c.pattern_url.MatchString(res.Url) {
return nil, errors.New("invalid url pattern")
}
// check for paywall
if c.pattern_paywall.MatchString(res.Body) {
return nil, errors.New("unable to extract article due to paywall")
}
// construct goquery doc
doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Body))
if err != nil {
return nil, err
}
// check for article type
tag := doc.Find("meta[property='og:type']")
pagetype, exists := tag.Attr("content")
if !exists || pagetype != "article" {
return nil, errors.New("unable to extract article, not of type article")
}
// get title
tag = doc.Find("meta[property='og:title']")
title, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no title tag")
}
// prepend description to content of article
tag = doc.Find("meta[name='description']")
content, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no description tag")
}
content += " "
// get publishing date
tag = doc.Find("meta[name='date']")
datestr, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no date tag")
}
date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
if err != nil {
return nil, err
}
// get content
tag = doc.Find("main[id='Inhalt'] div > p")
tag.Each(func(index int, p *goquery.Selection) {
content += " " + p.Text()
})
// clean up content string
content = c.pattern_whitespace.ReplaceAllString(content, " ")
content = strings.ReplaceAll(content, "»", "\"")
content = strings.ReplaceAll(content, "«", "\"")
// create new article
return &model.Article{
SourceUrl: res.Url,
PublishDate: date,
FetchDate: time.Now(),
Title: title,
Content: content,
}, nil
}

View File

@@ -0,0 +1,67 @@
package crawler
import (
"crowsnest/internal/util"
"log"
"strings"
"time"
"github.com/gocolly/colly/v2"
)
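// WebFeed periodically re-crawls a site's index page and streams every
// fetched page into its feed channel as a Resource.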
type WebFeed struct {
feed chan *Resource
collector *colly.Collector
}
// Init sets up the WebFeed with the given index URL and collector options
// and starts the background collection of Resources.
func (sf *WebFeed) Init(indexUrl string, options ...colly.CollectorOption) {
// create feed
sf.feed = make(chan *Resource, 100)
// create the collector with the caller-supplied options (allowed domains, cache dir, max depth)
sf.collector = colly.NewCollector(options...)
// publish every fetched page on the feed as a Resource
sf.collector.OnResponse(func(r *colly.Response) {
url := r.Request.URL.String()
body := string(r.Body)
sf.feed <- &Resource{Url: url, Body: body}
})
// cascade: follow every absolute link found on the page
sf.collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
url := e.Attr("href")
if !strings.HasPrefix(url, "http") {
return
}
e.Request.Visit(url)
})
// start runner
go sf.runner(indexUrl)
}
// Feed returns the channel on which the collected Resources are delivered.
func (sf *WebFeed) Feed() <-chan *Resource {
return sf.feed
}
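// runner loops forever: every five minutes it re-reads the index page and
// hands every URL found there to the collector.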
func (sf *WebFeed) runner(indexUrl string) {
for {
// wait five minutes between crawl rounds
time.Sleep(5 * time.Minute)
// collect index
urls, err := util.GetAllURLs(indexUrl)
if err != nil {
log.Println("error in WebFeed runner: ", err.Error())
continue
}
// visit urls
for _, url := range urls {
sf.collector.Visit(url)
}
}
}

View File

@@ -0,0 +1,100 @@
package crawler
import (
"crowsnest/internal/model"
"errors"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
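// ZeitConverter turns a fetched zeit.de page into a model.Article, rejecting
// paywalled pages, liveblogs, and pages that are not articles.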
type ZeitConverter struct {
pattern_url *regexp.Regexp
pattern_whitespace *regexp.Regexp
}
func (c *ZeitConverter) Init() {
c.pattern_url = regexp.MustCompile(`^https://(www\.)?zeit\.de[^#]*$`)
c.pattern_whitespace = regexp.MustCompile(`\s+`)
}
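// Convert parses the given Resource and returns the extracted article, or an
// error if the page is paywalled, a liveblog, not an article, or missing
// required metadata.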
func (c *ZeitConverter) Convert(res *Resource) (*model.Article, error) {
// check URL pattern
if !c.pattern_url.MatchString(res.Url) {
return nil, errors.New("invalid url pattern")
}
// construct goquery doc
doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Body))
if err != nil {
return nil, err
}
// check for article type
tag := doc.Find("meta[property='og:type']")
pagetype, exists := tag.Attr("content")
if !exists || pagetype != "article" {
return nil, errors.New("unable to extract article, not of type article")
}
// check for paywall
tag = doc.Find("meta[property='article:content_tier']")
pagetype, exists = tag.Attr("content")
if !exists || pagetype != "free" {
return nil, errors.New("unable to extract article due to paywal")
}
// get title
tag = doc.Find("meta[property='og:title']")
title, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no title tag")
}
// prepend description to content of article
tag = doc.Find("meta[name='description']")
content, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no description tag")
}
content += " "
if strings.Contains(content, "Das Liveblog") {
return nil, errors.New("unable to extract article, no support for liveblog")
}
// get publishing date
tag = doc.Find("meta[name='date']")
datestr, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no date tag")
}
date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
if err != nil {
return nil, err
}
// get content
tag = doc.Find("main > article > div.article-body p.article__item")
tag.Each(func(index int, p *goquery.Selection) {
content += " " + p.Text()
})
// clean up content string
content = c.pattern_whitespace.ReplaceAllString(content, " ")
content = strings.ReplaceAll(content, "»", "\"")
content = strings.ReplaceAll(content, "«", "\"")
// create new article
return &model.Article{
SourceUrl: res.Url,
PublishDate: date,
FetchDate: time.Now(),
Title: title,
Content: content,
}, nil
}

View File

@@ -1,152 +0,0 @@
package crawler
import (
"crowsnest/internal/model"
"errors"
"fmt"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
)
func (c *Crawler) SpiegelCollector() *colly.Collector {
// set cache, domain pattern and max recursion depth
collector := colly.NewCollector(
colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
colly.CacheDir("./persistence/spiegel_cache"),
colly.MaxDepth(5),
)
// store articles
collector.OnResponse(func(r *colly.Response) {
url := r.Request.URL.String()
c.SpiegelExtract(url, r.Body)
})
// cascade
collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
url := e.Attr("href")
if !strings.HasPrefix(url, "http") {
return
}
e.Request.Visit(url)
})
return collector
}
func (c *Crawler) SpiegelCollectIndex() error {
urls, err := c.GetAllURLs("https://www.spiegel.de/")
if err != nil {
return err
}
collector := c.SpiegelCollector()
collector.MaxDepth = 1
for _, url := range urls {
collector.Visit(url)
}
return nil
}
func (c *Crawler) SpiegelCollectArchive() {
collector := c.SpiegelCollector()
// go through archive
startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
currentDate := time.Now()
for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 1) {
urlDate := date.Format("02.01.2006")
url := fmt.Sprintf("https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html", urlDate)
collector.Visit(url)
}
}
func (c *Crawler) SpiegelExtract(url string, body []byte) error {
paywall_pattern := regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
url_pattern := regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
whitespace := regexp.MustCompile(`\s+`)
var exists bool
var pagetype, title, content, datestr string
var tag *goquery.Selection
var date time.Time
// check URL pattern
if !url_pattern.Match([]byte(url)) {
return errors.New("invalid url pattern")
}
// check for paywall
if paywall_pattern.Match(body) {
return errors.New("unable to extract article due to paywal")
}
// construct goquery doc
doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
if err != nil {
return err
}
// check for article type
tag = doc.Find("meta[property='og:type']")
pagetype, exists = tag.Attr("content")
if !exists || pagetype != "article" {
return errors.New("unable to extract article, not of type article")
}
// get title
tag = doc.Find("meta[property='og:title']")
title, exists = tag.Attr("content")
if !exists {
return errors.New("unable to extract article, no title tag")
}
// prepend description to content of article
tag = doc.Find("meta[name='description']")
content, exists = tag.Attr("content")
content += " "
if !exists {
return errors.New("unable to extract article, no description tag")
}
// get publishing date
tag = doc.Find("meta[name='date']")
datestr, exists = tag.Attr("content")
if !exists {
return errors.New("unable to extract article, no date tag")
}
date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
if err != nil {
return err
}
// get content
tag = doc.Find("main[id='Inhalt'] div > p")
tag.Each(func(index int, p *goquery.Selection) {
content += " " + p.Text()
})
// clean up content string
content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
content = strings.ReplaceAll(content, "»", "\"")
content = strings.ReplaceAll(content, "«", "\"")
// insert new article
article := model.Article{
SourceUrl: url,
PublishDate: date,
FetchDate: time.Now(),
Title: title,
Content: content,
}
err = c.Articles.Insert(&article)
return err
}

View File

@@ -1,160 +0,0 @@
package crawler
import (
"crowsnest/internal/model"
"errors"
"fmt"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
)
func (c *Crawler) ZeitCollector() *colly.Collector {
// set cache, domain pattern and max recursion depth
collector := colly.NewCollector(
colly.AllowedDomains("www.zeit.de", "zeit.de"),
colly.CacheDir("./persistence/zeit_cache"),
colly.MaxDepth(5),
)
// store articles
collector.OnResponse(func(r *colly.Response) {
url := r.Request.URL.String()
c.ZeitExtract(url, r.Body)
})
// cascade
collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
url := e.Attr("href")
if !strings.HasPrefix(url, "http") {
return
}
e.Request.Visit(url)
})
return collector
}
func (c *Crawler) ZeitCollectIndex() error {
urls, err := c.GetAllURLs("https://www.zeit.de/index")
if err != nil {
return err
}
collector := c.ZeitCollector()
collector.MaxDepth = 1
for _, url := range urls {
collector.Visit(url)
}
return nil
}
// Gets every page of the archive of zeit.de and stores the responses into the
// database.
func (c *Crawler) ZeitCollectArchive() {
collector := c.ZeitCollector()
// go through archive
startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
//startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
currentDate := time.Now()
for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 7) {
year, week := date.ISOWeek()
url := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week)
collector.Visit(url)
}
}
func (c *Crawler) ZeitExtract(url string, body []byte) error {
url_pattern := regexp.MustCompile(`^https://(www\.)?zeit\.de[^#]*$`)
whitespace := regexp.MustCompile(`\s+`)
var exists bool
var pagetype, title, content, datestr string
var tag *goquery.Selection
var date time.Time
// check URL pattern
if !url_pattern.Match([]byte(url)) {
return errors.New("invalid url pattern")
}
// construct goquery doc
doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
if err != nil {
return err
}
// check for article type
tag = doc.Find("meta[property='og:type']")
pagetype, exists = tag.Attr("content")
if !exists || pagetype != "article" {
return errors.New("unable to extract article, not of type article")
}
// check for paywall
tag = doc.Find("meta[property='article:content_tier']")
pagetype, exists = tag.Attr("content")
if !exists || pagetype != "free" {
return errors.New("unable to extract article due to paywall")
}
// get title
tag = doc.Find("meta[property='og:title']")
title, exists = tag.Attr("content")
if !exists {
return errors.New("unable to extract article, no title tag")
}
// prepend description to content of article
tag = doc.Find("meta[name='description']")
content, exists = tag.Attr("content")
content += " "
if !exists {
return errors.New("unable to extract article, no description tag")
}
if strings.Contains(content, "Das Liveblog") {
return errors.New("unable to extract article, no support for liveblog")
}
// get publishing date
tag = doc.Find("meta[name='date']")
datestr, exists = tag.Attr("content")
if !exists {
return errors.New("unable to extract article, no date tag")
}
date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
if err != nil {
return err
}
// get content
tag = doc.Find("main > article > div.article-body p.article__item")
tag.Each(func(index int, p *goquery.Selection) {
content += " " + p.Text()
})
// clean up content string
content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
content = strings.ReplaceAll(content, "»", "\"")
content = strings.ReplaceAll(content, "«", "\"")
// insert new article
article := model.Article{
SourceUrl: url,
PublishDate: date,
FetchDate: time.Now(),
Title: title,
Content: content,
}
err = c.Articles.Insert(&article)
return err
}

View File

@@ -16,6 +16,18 @@ type Article struct {
AiSummary string
}
func (a *Article) Clone() *Article {
return &Article{
Id: a.Id,
SourceUrl: a.SourceUrl,
PublishDate: a.PublishDate,
FetchDate: a.FetchDate,
Title: a.Title,
Content: a.Content,
AiSummary: a.AiSummary,
}
}
// TODO docstring
type ArticleViewModel struct {
Id int

View File

@@ -0,0 +1,3 @@
package util
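// Converter maps an input value to an output value, returning an error when
// the conversion is not possible.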
type Converter[I any, O any] func(I) (O, error)

View File

@@ -0,0 +1,37 @@
package util
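// Distributer is a small fan-out pub/sub helper: published items are queued,
// cloned, and handed to every subscribed hook. A minimal usage sketch
// (from a client package; the hook body is illustrative):
//
// d := &util.Distributer[*model.Article]{}
// d.Init()
// d.Subscribe(func(a *model.Article) { log.Println(a.Title) })
// d.Publish(article)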
type Distributer[T IClone[T]] struct {
queue chan T
hooks []func(T)
}
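// Init allocates the publish queue and the hook list; it must be called
// before Publish or Subscribe.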
func (d *Distributer[T]) Init() {
d.queue = make(chan T, 100)
d.hooks = make([]func(T), 0)
}
// Publish enqueues an item; the runner hands a clone of it to every hook
// that has subscribed to this Distributer.
func (d *Distributer[T]) Publish(item T) {
d.queue <- item
}
// Subscribe adds a new hook to the Distributer. The hook is called
// asynchronously whenever a new item is published.
func (d *Distributer[T]) Subscribe(hook func(T)) {
d.hooks = append(d.hooks, hook)
if len(d.hooks) == 1 {
go d.runner()
}
}
// runner is started asynchronously when Subscribe is first called. Whenever
// an item is published it distributes a clone of that item to every hook.
func (d *Distributer[T]) runner() {
for val := range d.queue {
for _, f := range d.hooks {
go f(val.Clone())
}
}
}

View File

@@ -0,0 +1,5 @@
package util
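// IClone is implemented by types that can return a copy of themselves;
// Distributer uses it to give each subscriber its own copy of a published item.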
type IClone[T any] interface {
Clone() T
}

View File

@@ -1,19 +1,14 @@
package crawler
package util
import (
"crowsnest/internal/model/database"
"fmt"
"net/http"
"github.com/PuerkitoBio/goquery"
)
type Crawler struct {
Articles *database.ArticleModel
}
// GetAllURLs fetches all URLs from a given web page URL
func (c *Crawler) GetAllURLs(pageURL string) ([]string, error) {
func GetAllURLs(pageURL string) ([]string, error) {
// Send a GET request to the provided URL
resp, err := http.Get(pageURL)
if err != nil {