From 48d8b99fc395f9465fbc5360e5837461054661ef Mon Sep 17 00:00:00 2001
From: Elias Kohout
Date: Sun, 12 Jan 2025 03:35:19 +0100
Subject: [PATCH] run crawlers with the webapp

---
 src/cmd/crawler/collectors/collector.go     |  7 ---
 src/cmd/crawler/main.go                     | 43 ---------------
 src/cmd/frontend/main.go                    | 14 +++++
 src/internal/crawler/crawler.go             | 48 ++++++++++++++++
 .../crawler}/spiegel.go                     | 49 +++++++++++------
 .../collectors => internal/crawler}/zeit.go | 55 ++++++++++++-------
 6 files changed, 127 insertions(+), 89 deletions(-)
 delete mode 100644 src/cmd/crawler/collectors/collector.go
 delete mode 100644 src/cmd/crawler/main.go
 create mode 100644 src/internal/crawler/crawler.go
 rename src/{cmd/crawler/collectors => internal/crawler}/spiegel.go (79%)
 rename src/{cmd/crawler/collectors => internal/crawler}/zeit.go (79%)

diff --git a/src/cmd/crawler/collectors/collector.go b/src/cmd/crawler/collectors/collector.go
deleted file mode 100644
index b3967f3..0000000
--- a/src/cmd/crawler/collectors/collector.go
+++ /dev/null
@@ -1,7 +0,0 @@
-package collectors
-
-import "crowsnest/internal/model/database"
-
-type Collector struct {
-	Articles *database.ArticleModel
-}
diff --git a/src/cmd/crawler/main.go b/src/cmd/crawler/main.go
deleted file mode 100644
index a08fa65..0000000
--- a/src/cmd/crawler/main.go
+++ /dev/null
@@ -1,43 +0,0 @@
-package main
-
-import (
-	"crowsnest/cmd/crawler/collectors"
-	"crowsnest/internal/model/database"
-	"database/sql"
-	"log"
-	"os"
-	"sync"
-
-	_ "github.com/lib/pq"
-)
-
-func main() {
-	// collect environement variables
-	databaseURL := os.Getenv("DB_URL")
-
-	// connect to database
-	db, err := sql.Open("postgres", databaseURL)
-	if err != nil {
-		log.Fatal(err)
-	}
-	defer db.Close()
-
-	// collect websites
-	coll := collectors.Collector{
-		Articles: &database.ArticleModel{DB: db},
-	}
-
-	var wg sync.WaitGroup
-	wg.Add(2)
-
-	go func() {
-		defer wg.Done()
-		coll.CollectSpiegel()
-	}()
-	go func() {
-		defer wg.Done()
-		go coll.CollectZeit()
-	}()
-
-	wg.Wait()
-}
diff --git a/src/cmd/frontend/main.go b/src/cmd/frontend/main.go
index b4da5aa..e0a79fa 100644
--- a/src/cmd/frontend/main.go
+++ b/src/cmd/frontend/main.go
@@ -2,10 +2,12 @@ package main
 
 import (
 	"crowsnest/internal/app"
+	"crowsnest/internal/crawler"
 	"crowsnest/internal/middleware"
 	"crowsnest/internal/model/database"
 	"log"
 	"net/http"
+	"time"
 
 	_ "github.com/lib/pq"
 )
@@ -16,6 +18,18 @@ func main() {
 		log.Fatal("failed to connect to database due to", err.Error())
 	}
 
+	// run web crawlers
+	coll := crawler.Crawler{
+		Articles: &database.ArticleModel{DB: db},
+	}
+	go func() {
+		for {
+			coll.ZeitCollectIndex()
+			coll.SpiegelCollectIndex()
+			time.Sleep(5 * time.Minute)
+		}
+	}()
+
 	// define app
 	webapp := app.NewApp(db)
 
diff --git a/src/internal/crawler/crawler.go b/src/internal/crawler/crawler.go
new file mode 100644
index 0000000..80fee97
--- /dev/null
+++ b/src/internal/crawler/crawler.go
@@ -0,0 +1,48 @@
+package crawler
+
+import (
+	"crowsnest/internal/model/database"
+	"fmt"
+	"net/http"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+type Crawler struct {
+	Articles *database.ArticleModel
+}
+
+// GetAllURLs fetches all URLs from a given web page URL
+func (c *Crawler) GetAllURLs(pageURL string) ([]string, error) {
+	// Send a GET request to the provided URL
+	resp, err := http.Get(pageURL)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch URL %s: %w", pageURL, err)
+	}
+	defer resp.Body.Close()
+
+	// Check if the response status is OK
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("HTTP request failed with status code %d", resp.StatusCode)
fmt.Errorf("HTTP request failed with status code %d", resp.StatusCode) + } + + // Parse the HTML document + doc, err := goquery.NewDocumentFromReader(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to parse HTML: %w", err) + } + + // Slice to store the extracted URLs + var urls []string + + // Select all anchor tags and extract the href attribute + doc.Find("a").Each(func(index int, element *goquery.Selection) { + // Get the href attribute + href, exists := element.Attr("href") + if exists { + urls = append(urls, href) + } + }) + + return urls, nil +} diff --git a/src/cmd/crawler/collectors/spiegel.go b/src/internal/crawler/spiegel.go similarity index 79% rename from src/cmd/crawler/collectors/spiegel.go rename to src/internal/crawler/spiegel.go index 5649e09..2ae5a1c 100644 --- a/src/cmd/crawler/collectors/spiegel.go +++ b/src/internal/crawler/spiegel.go @@ -1,10 +1,9 @@ -package collectors +package crawler import ( "crowsnest/internal/model" "errors" "fmt" - "log" "regexp" "strings" "time" @@ -13,34 +12,48 @@ import ( "github.com/gocolly/colly/v2" ) -func (c *Collector) CollectSpiegel() { - collycollector := colly.NewCollector( +func (c *Crawler) SpiegelCollector() *colly.Collector { + // set cache, domain pattern and max recursion deepth + collector := colly.NewCollector( colly.AllowedDomains("www.spiegel.de", "spiegel.de"), colly.CacheDir("./persistence/spiegel_cache"), - colly.MaxDepth(3), + colly.MaxDepth(5), ) - // store articles - collycollector.OnResponse(func(r *colly.Response) { + collector.OnResponse(func(r *colly.Response) { url := r.Request.URL.String() - err := c.ExtractSpiegel(url, r.Body) - if err == nil { - log.Println("added article", url) - } else { - log.Println("failed to add article:", err, "("+url+")") - } + c.SpiegelExtract(url, r.Body) }) - // cascade - collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) { + collector.OnHTML("a[href]", func(e *colly.HTMLElement) { url := e.Attr("href") - if !strings.HasPrefix(url, "http") { return } e.Request.Visit(url) }) + return collector +} + +func (c *Crawler) SpiegelCollectIndex() error { + urls, err := c.GetAllURLs("https://www.spiegel.de/") + if err != nil { + return err + } + + collector := c.SpiegelCollector() + collector.MaxDepth = 1 + + for _, url := range urls { + collector.Visit(url) + } + return nil +} + +func (c *Crawler) SpiegelCollectArchive() { + collector := c.SpiegelCollector() + // go through archive startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC) currentDate := time.Now() @@ -49,11 +62,11 @@ func (c *Collector) CollectSpiegel() { urlDate := date.Format("02.01.2006") url := fmt.Sprintf("https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html", urlDate) - collycollector.Visit(url) + collector.Visit(url) } } -func (c *Collector) ExtractSpiegel(url string, body []byte) error { +func (c *Crawler) SpiegelExtract(url string, body []byte) error { paywall_pattern := regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`) url_pattern := regexp.MustCompile(`^https://(www\.)?spiegel.de.*`) whitespace := regexp.MustCompile(`\s+`) diff --git a/src/cmd/crawler/collectors/zeit.go b/src/internal/crawler/zeit.go similarity index 79% rename from src/cmd/crawler/collectors/zeit.go rename to src/internal/crawler/zeit.go index 88e7056..90d4e5e 100644 --- a/src/cmd/crawler/collectors/zeit.go +++ b/src/internal/crawler/zeit.go @@ -1,10 +1,9 @@ -package collectors +package crawler import ( "crowsnest/internal/model" "errors" "fmt" - "log" "regexp" "strings" "time" @@ -13,36 +12,50 
 	"github.com/gocolly/colly/v2"
 )
 
-// Gets every page of the archive of zeit.de and stores the responses into the
-// database.
-func (c *Collector) CollectZeit() {
-	collycollector := colly.NewCollector(
+func (c *Crawler) ZeitCollector() *colly.Collector {
+	// set cache, domain pattern and max recursion depth
+	collector := colly.NewCollector(
 		colly.AllowedDomains("www.zeit.de", "zeit.de"),
 		colly.CacheDir("./persistence/zeit_cache"),
-		colly.MaxDepth(3),
+		colly.MaxDepth(5),
 	)
 
-	// store articles
-	collycollector.OnResponse(func(r *colly.Response) {
+	collector.OnResponse(func(r *colly.Response) {
 		url := r.Request.URL.String()
-		err := c.ExtractZeit(url, r.Body)
-		if err == nil {
-			log.Println("added article", url)
-		} else {
-			log.Println("failed to add article:", err, "("+url+")")
-		}
+		c.ZeitExtract(url, r.Body)
 	})
 
-	// cascade
-	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
+	collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
 		url := e.Attr("href")
-
 		if !strings.HasPrefix(url, "http") {
 			return
 		}
 		e.Request.Visit(url)
 	})
 
+	return collector
+}
+
+func (c *Crawler) ZeitCollectIndex() error {
+	urls, err := c.GetAllURLs("https://www.zeit.de/index")
+	if err != nil {
+		return err
+	}
+
+	collector := c.ZeitCollector()
+	collector.MaxDepth = 1
+
+	for _, url := range urls {
+		collector.Visit(url)
+	}
+	return nil
+}
+
+// Gets every page of the archive of zeit.de and stores the responses into the
+// database.
+func (c *Crawler) ZeitCollectArchive() {
+	collector := c.ZeitCollector()
+
 	// go through archive
 	startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
 	//startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
@@ -52,12 +65,12 @@ func (c *Collector) CollectZeit() {
 		year, week := date.ISOWeek()
 		url := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week)
 
-		collycollector.Visit(url)
+		collector.Visit(url)
 	}
 }
 
-func (c *Collector) ExtractZeit(url string, body []byte) error {
-	url_pattern := regexp.MustCompile(`^https://(www\.)?zeit.de.*`)
+func (c *Crawler) ZeitExtract(url string, body []byte) error {
+	url_pattern := regexp.MustCompile(`^https://(www\.)?zeit\.de[^#]*$`)
 	whitespace := regexp.MustCompile(`\s+`)
 
 	var exists bool
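
Note, not part of the patch: the crawl loop added to cmd/frontend/main.go discards the errors returned by ZeitCollectIndex and SpiegelCollectIndex and has no way to stop once started. Below is a minimal sketch of an alternative wiring that logs those errors and shuts down with the rest of the webapp. It only assumes the APIs introduced above (crawler.Crawler, ZeitCollectIndex, SpiegelCollectIndex); the function name runCrawlLoop and the use of a context are illustrative, not taken from the repository.

package main

import (
	"context"
	"log"
	"time"

	"crowsnest/internal/crawler"
)

// runCrawlLoop re-crawls both index pages once per interval, logs any
// collector errors, and returns when ctx is cancelled.
// (runCrawlLoop is a hypothetical helper, not part of the patch.)
func runCrawlLoop(ctx context.Context, coll *crawler.Crawler, interval time.Duration) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		if err := coll.ZeitCollectIndex(); err != nil {
			log.Println("zeit index crawl failed:", err)
		}
		if err := coll.SpiegelCollectIndex(); err != nil {
			log.Println("spiegel index crawl failed:", err)
		}

		// Wait for the next tick or for shutdown, whichever comes first.
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}
	}
}

In main() this would replace the anonymous goroutine, e.g. go runCrawlLoop(ctx, &coll, 5*time.Minute), with ctx cancelled when the HTTP server is told to shut down.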