From a6f9a6aa0e1ce9259e299309b4b18a6ee3ee29c4 Mon Sep 17 00:00:00 2001
From: eliaskohout
Date: Mon, 20 Jan 2025 20:27:01 +0000
Subject: [PATCH] rm webfeed.go

---
 src/internal/crawler/WebFeed.go | 67 ---------------------------------
 1 file changed, 67 deletions(-)
 delete mode 100644 src/internal/crawler/WebFeed.go

diff --git a/src/internal/crawler/WebFeed.go b/src/internal/crawler/WebFeed.go
deleted file mode 100644
index 7363d3e..0000000
--- a/src/internal/crawler/WebFeed.go
+++ /dev/null
@@ -1,67 +0,0 @@
-package crawler
-
-import (
-    "crowsnest/internal/util"
-    "log"
-    "strings"
-    "time"
-
-    "github.com/gocolly/colly/v2"
-)
-
-type WebFeed struct {
-    feed      chan *Resource
-    collector *colly.Collector
-}
-
-// Init the WebFeed, starting the process of collecting Resources.
-func (sf *WebFeed) Init(indexUrl string, options ...colly.CollectorOption) {
-    // create feed
-    sf.feed = make(chan *Resource, 100)
-
-    // set cache, domain pattern and max recursion depth
-    sf.collector = colly.NewCollector(options...)
-
-    // return IResources aka pages
-    sf.collector.OnResponse(func(r *colly.Response) {
-        url := r.Request.URL.String()
-        body := string(r.Body)
-        sf.feed <- &Resource{Url: url, Body: body}
-    })
-
-    // cascade
-    sf.collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
-        url := e.Attr("href")
-        if !strings.HasPrefix(url, "http") {
-            return
-        }
-        e.Request.Visit(url)
-    })
-
-    // start runner
-    go sf.runner(indexUrl)
-}
-
-// Get the channel into which the collected Resources will be written.
-func (sf *WebFeed) Feed() <-chan *Resource {
-    return sf.feed
-}
-
-func (sf *WebFeed) runner(indexUrl string) {
-    for {
-        // sleep for 5min
-        time.Sleep(time.Second * 300)
-
-        // collect index
-        urls, err := util.GetAllURLs(indexUrl)
-        if err != nil {
-            log.Println("error in WebFeed runner: ", err.Error())
-            continue
-        }
-
-        // visit urls
-        for _, url := range urls {
-            sf.collector.Visit(url)
-        }
-    }
-}
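
Note for reviewers: below is a minimal, hypothetical sketch of how a caller inside the crowsnest module could have consumed the WebFeed being deleted above. The WebFeed type, its Init/Feed methods, the colly CollectorOption parameter, and the Resource fields (Url, Body) are taken from the deleted file; the index URL, the MaxDepth/AllowedDomains options, and the main package are illustrative assumptions, not code from this repository.

package main

import (
    "fmt"

    "crowsnest/internal/crawler"
    "github.com/gocolly/colly/v2"
)

func main() {
    // Hypothetical usage sketch; everything not shown in the deleted file
    // above (URL, options, printing) is assumed for illustration.
    var wf crawler.WebFeed
    wf.Init(
        "https://example.com/index",         // assumed index URL
        colly.MaxDepth(2),                   // limit recursion depth
        colly.AllowedDomains("example.com"), // restrict crawling to one domain
    )

    // The runner sleeps five minutes between crawls, so Resources only start
    // arriving after the first crawl completes; the channel is never closed.
    for res := range wf.Feed() {
        fmt.Printf("crawled %s (%d bytes)\n", res.Url, len(res.Body))
    }
}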