move collection and extraction of articles into their own file; add custom response cache

2025-01-04 16:53:46 +01:00
parent f31c1d7793
commit d63ee8dcf2
14 changed files with 519 additions and 105 deletions
@@ -0,0 +1,7 @@
+package collectors
+
+import "crowsnest/internal/model/sqlite"
+
+// Collector downloads pages of news websites and writes the raw responses
+// into the response store, from where they can be processed later.
+type Collector struct {
+	// Responses is the store every scraped page is inserted into.
+	Responses *sqlite.ResponseModel
+}
@@ -0,0 +1,110 @@
+package collectors
+
+import (
+	//"crowsnest/internal/model"
+	//"regexp"
+	//"time"
+	//"strings"
+
+	"fmt"
+	"time"
+
+	"github.com/gocolly/colly/v2"
+)
+
+
+// Spiegel crawls the daily news archive of spiegel.de, starting at
+// 2000-01-01, follows the links found on the visited pages (limited by
+// MaxDepth(3)) and stores the body of every scraped page in the response
+// store.
+//
+// Turning the stored responses into articles is not done here; that logic
+// lives in the extractors package. Archive pages that cannot be visited are
+// reported on standard output and skipped.
+func (c *Collector) Spiegel() {
+	collycollector := colly.NewCollector(
+		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
+		colly.CacheDir("./persistence/spiegel_cache"),
+		colly.MaxDepth(3),
+	)
+
+	// cascade: follow every link on the page. The error is dropped on
+	// purpose, since links that are off-domain, already visited or too deep
+	// are expected to be rejected here.
+	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
+		_ = e.Request.Visit(e.Attr("href"))
+	})
+
+	// cache: store the body of every scraped page under its URL.
+	// NOTE(review): the result of Responses.Insert, if any, is not checked
+	// here - confirm whether it can fail.
+	collycollector.OnScraped(func(r *colly.Response) {
+		c.Responses.Insert(r.Request.URL.String(), string(r.Body))
+	})
+
+	// go through archive, one page per day up to the current time
+	startDate := time.Date(2000, time.January, 1, 0, 0, 0, 0, time.UTC)
+	currentDate := time.Now()
+
+	for date := startDate; !date.After(currentDate); date = date.AddDate(0, 0, 1) {
+		urlDate := date.Format("02.01.2006")
+		url := fmt.Sprintf("https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html", urlDate)
+
+		// a single failing archive page must not stop the crawl
+		if err := collycollector.Visit(url); err != nil {
+			fmt.Printf("failed to visit %s: %v\n", url, err)
+		}
+	}
+}
@@ -0,0 +1,39 @@
+package collectors
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/gocolly/colly/v2"
+)
+
+// Zeit gets every page of the archive of zeit.de and stores the responses
+// into the database.
+//
+// The archive is walked in steps of seven days starting at 1946-01-01; the
+// URL of each archive page is built from the ISO year and week of the date.
+// Archive pages that cannot be visited are reported on standard output and
+// skipped.
+func (c *Collector) Zeit() {
+	collycollector := colly.NewCollector(
+		colly.AllowedDomains("www.zeit.de", "zeit.de"),
+		colly.CacheDir("./persistence/zeit_cache"),
+		colly.MaxDepth(2),
+	)
+
+	// cascade: follow every link on the page. The error is dropped on
+	// purpose, since links that are off-domain, already visited or too deep
+	// are expected to be rejected here.
+	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
+		_ = e.Request.Visit(e.Attr("href"))
+	})
+
+	// cache: store the body of every scraped page under its URL.
+	// NOTE(review): the result of Responses.Insert, if any, is not checked
+	// here - confirm whether it can fail.
+	collycollector.OnScraped(func(r *colly.Response) {
+		c.Responses.Insert(r.Request.URL.String(), string(r.Body))
+	})
+
+	// go through archive, one page per week up to the current time
+	startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
+	currentDate := time.Now()
+
+	for date := startDate; !date.After(currentDate); date = date.AddDate(0, 0, 7) {
+		year, week := date.ISOWeek()
+		url := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week)
+
+		// a single failing archive page must not stop the crawl
+		if err := collycollector.Visit(url); err != nil {
+			fmt.Printf("failed to visit %s: %v\n", url, err)
+		}
+	}
+}
@@ -0,0 +1,8 @@
+package extractors
+
+import "crowsnest/internal/model/sqlite"
+
+// Extractor turns stored responses into articles.
+//
+// Responses is the store the pages are read from and in which they are marked
+// as processed; Articles is where the extracted articles are inserted.
+type Extractor struct {
+	Responses *sqlite.ResponseModel
+	Articles  *sqlite.ArticleModel
+}
@@ -0,0 +1,113 @@
+package extractors
+
+import (
+	"crowsnest/internal/model"
+	"log"
+	"regexp"
+	"strings"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+
+// Spiegel turns stored spiegel.de responses into articles.
+//
+// It goes through all unprocessed URLs of the response store, skips those
+// that do not belong to spiegel.de and tries to read type, title,
+// description, publishing date, author and text from each remaining page.
+// Pages that lack the inactive-paywall marker, are not of type "article" or
+// miss one of the required meta tags are marked as processed without creating
+// an article. Responses that cannot be loaded or parsed and articles that
+// cannot be inserted are logged and stay unprocessed.
+//
+// An error is returned only if the list of unprocessed URLs cannot be read.
+//
+// NOTE(review): the result of Responses.Processed, if any, is not checked -
+// confirm whether it can fail.
+func (extractor *Extractor) Spiegel() error {
+	// get urls to process
+	urls, err := extractor.Responses.UnprocessedUrls()
+	if err != nil {
+		return err
+	}
+
+	paywallFalsePattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
+	// the dot of the domain is escaped and the host has to end right after
+	// ".de", so that look-alike hosts such as "spiegelxde.example" or
+	// "spiegel.de.example" are not accepted
+	urlPattern := regexp.MustCompile(`^https://(www\.)?spiegel\.de([/?#]|$)`)
+	whitespace := regexp.MustCompile(`\s+`)
+
+	for _, url := range urls {
+		// check url pattern
+		if !urlPattern.MatchString(url) {
+			continue
+		}
+
+		// get response
+		res, err := extractor.Responses.GetByUrl(url)
+		if err != nil {
+			log.Println("failed to process url", url, "with", err)
+			continue
+		}
+
+		// check for paywall
+		if !paywallFalsePattern.MatchString(res.Content) {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// construct goquery doc
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Content))
+		if err != nil {
+			log.Println("failed to process url", url, "with", err)
+			continue
+		}
+
+		// meta returns the content attribute of the first element matching
+		// the selector and whether that attribute exists
+		meta := func(selector string) (string, bool) {
+			return doc.Find(selector).Attr("content")
+		}
+
+		// check for article type
+		pagetype, exists := meta("meta[property='og:type']")
+		if !exists || pagetype != "article" {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// get title
+		title, exists := meta("meta[property='og:title']")
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// get description, it is prepended to the content of the article
+		description, exists := meta("meta[name='description']")
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// get publishing date
+		datestr, exists := meta("meta[name='date']")
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// RFC 3339 covers the numeric offsets seen so far and "Z" as well
+		date, err := time.Parse(time.RFC3339, datestr)
+		if err != nil {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// get author
+		author, exists := meta("meta[name='author']")
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// get content: description followed by all paragraphs
+		var text strings.Builder
+		text.WriteString(description)
+		doc.Find("main[id='Inhalt'] div > p").Each(func(_ int, p *goquery.Selection) {
+			text.WriteString(" ")
+			text.WriteString(p.Text())
+		})
+
+		// clean up content string
+		content := whitespace.ReplaceAllString(text.String(), " ")
+		content = strings.ReplaceAll(content, "»", "\"")
+		content = strings.ReplaceAll(content, "«", "\"")
+		content = strings.TrimSpace(content)
+
+		// insert new article
+		article := model.Article{
+			SourceUrl:   url,
+			PublishDate: date,
+			FetchDate:   res.FetchDate,
+			Title:       title,
+			Content:     content,
+			Author:      author,
+		}
+
+		// on failure the response stays unprocessed, so that it is tried
+		// again on the next run
+		if err := extractor.Articles.Insert(&article); err != nil {
+			log.Println("failed to insert article for", url, "with", err)
+			continue
+		}
+
+		extractor.Responses.Processed(url)
+		log.Println("found article at", url)
+	}
+
+	return nil
+}
@@ -1,114 +1,35 @@
 package main

 import (
-	"crowsnest/internal/model"
+	"crowsnest/cmd/crawler/collectors"
+	"crowsnest/cmd/crawler/extractors"
 	"crowsnest/internal/model/sqlite"
 	"database/sql"
-	"fmt"
 	"log"
-	"regexp"
-	"strings"
-	"time"

-	"github.com/gocolly/colly/v2"
 	_ "github.com/mattn/go-sqlite3"
 )

-func spiegelCollector(results *map[string]*model.Article) *colly.Collector {
-	c := colly.NewCollector(
-		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
-		colly.CacheDir("./persistence/spiegel_cache"),
-		colly.MaxDepth(5),
-	)
-
-	// create entry if not behind paywall
-	paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
-	c.OnResponse(func(r *colly.Response) {
-		if paywall_false_pattern.Match(r.Body) {
-			url := r.Request.URL.String()
-			(*results)[url] = &model.Article{
-				SourceUrl: url,
-				FetchDate: time.Now(),
-				Content:   "",
-			}
-		}
-
-	})
-
-	// check for article type
-	c.OnHTML("meta[property='og:type']", func(e *colly.HTMLElement) {
-		if e.Attr("content") != "article" {
-		} 
-	})
-
-	// add title
-	c.OnHTML("meta[property='og:title']", func(e *colly.HTMLElement) {
-		if val, ok := (*results)[e.Request.URL.String()]; ok {
-			val.Title = e.Attr("content")
-		}
-	})
-
-	// prepend description to content of article
-	c.OnHTML("meta[name='description']", func(e *colly.HTMLElement) {
-		if val, ok := (*results)[e.Request.URL.String()]; ok {
-			val.Content = e.Attr("content") + val.Content
-		}
-	})
-
-    // add publishing date
-	c.OnHTML("meta[name='date']", func(e *colly.HTMLElement) {
-		if val, ok := (*results)[e.Request.URL.String()]; ok {
-			t, err := time.Parse("2006-01-02T15:04:05-07:00", e.Attr("content"))
-			if err != nil {
-				panic(err)
-			}
-			val.PublishDate = t
-		}
-	})
-
-	// add author
-	c.OnHTML("meta[name='author']", func(e *colly.HTMLElement) {
-	    if val, ok := (*results)[e.Request.URL.String()]; ok {
-	        val.Author = e.Attr("content")
-	    }
-	})
-
-    // add content
-	c.OnHTML("main[id='Inhalt'] div > p", func(e *colly.HTMLElement) {
-		if val, ok := (*results)[e.Request.URL.String()]; ok {
-            cont := val.Content
-
-            pattern := regexp.MustCompile("\\s+")
-            cont = string(pattern.ReplaceAll([]byte(cont), []byte(" ")))
-            cont = strings.ReplaceAll(cont, "»", "\"")
-            cont = strings.ReplaceAll(cont, "«", "\"")
-			val.Content = cont + " " + e.Text
-		}
-	})
-
-	// cascade
-	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
-		e.Request.Visit(e.Attr("href"))
-	})
-
-	return c
-}
-
 func main() {
-	res := make(map[string]*model.Article)
-	c := spiegelCollector(&res)
-
-	c.Visit("https://www.spiegel.de/")
-
-    db, err := sql.Open("sqlite3", "./persistence/app.db")
-    if err != nil { log.Fatal(err) }
-
-    db_articles := &sqlite.ArticleModel{ DB: db }
-
-    counter := 0
-	for _, val := range res {
-        counter++
-        db_articles.Insert(val)
+	// open database
+	db, err := sql.Open("sqlite3", "./persistence/app.db")
+	if err != nil {
+		log.Fatal(err)
 	}
-    fmt.Println(counter)
+
+	// collect websites
+	_ = collectors.Collector{
+		Responses: &sqlite.ResponseModel{DB: db},
+	}
+
+	//coll.Spiegel()
+	//coll.Zeit()
+
+	// extract articles from websites
+	extr := extractors.Extractor{
+		Responses: &sqlite.ResponseModel{DB: db},
+		Articles:  &sqlite.ArticleModel{DB: db},
+	}
+
+    extr.Spiegel()
 }