remove duplicate files #8

Merged
eliaskohout merged 12 commits from rm_duplicate_file into main 2025-01-20 21:29:36 +01:00
Showing only changes of commit a6f9a6aa0e


@@ -1,67 +0,0 @@
package crawler

import (
	"crowsnest/internal/util"
	"log"
	"strings"
	"time"

	"github.com/gocolly/colly/v2"
)

type WebFeed struct {
	feed      chan *Resource
	collector *colly.Collector
}

// Init the WebFeed, starting the process of collecting Resources.
func (sf *WebFeed) Init(indexUrl string, options ...colly.CollectorOption) {
	// create feed
	sf.feed = make(chan *Resource, 100)

	// set cache, domain pattern and max recursion depth
	sf.collector = colly.NewCollector(options...)

	// return IResources aka pages
	sf.collector.OnResponse(func(r *colly.Response) {
		url := r.Request.URL.String()
		body := string(r.Body)
		sf.feed <- &Resource{Url: url, Body: body}
	})

	// cascade
	sf.collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		url := e.Attr("href")
		if !strings.HasPrefix(url, "http") {
			return
		}
		e.Request.Visit(url)
	})

	// start runner
	go sf.runner(indexUrl)
}

// Get the channel into which the collected Resources will be written.
func (sf *WebFeed) Feed() <-chan *Resource {
	return sf.feed
}

func (sf *WebFeed) runner(indexUrl string) {
	for {
		// sleep for 5min
		time.Sleep(time.Second * 300)

		// collect index
		urls, err := util.GetAllURLs(indexUrl)
		if err != nil {
			log.Println("error in WebFeed runner: ", err.Error())
			continue
		}

		// visit urls
		for _, url := range urls {
			sf.collector.Visit(url)
		}
	}
}
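
For context, a minimal sketch of how a consumer could have driven this (now removed) WebFeed: Init is called with an index URL and colly options, and the caller drains the Feed() channel. The main package, the index URL, and the specific colly options below are assumptions for illustration, not code from this repository.

// Hypothetical usage sketch, not part of this repository.
package main

import (
	"log"

	"crowsnest/internal/crawler"
	"github.com/gocolly/colly/v2"
)

func main() {
	var wf crawler.WebFeed

	// Placeholder index URL and collector options.
	wf.Init(
		"https://example.com/index",
		colly.MaxDepth(2),
		colly.AllowedDomains("example.com"),
	)

	// Drain the feed; each Resource carries the page URL and body.
	for res := range wf.Feed() {
		log.Printf("fetched %s (%d bytes)", res.Url, len(res.Body))
	}
}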