remove duplicate files #8

Merged
eliaskohout merged 12 commits from rm_duplicate_file into main 2025-01-20 21:29:36 +01:00
Showing only changes of commit a6f9a6aa0e


@@ -1,67 +0,0 @@
package crawler

import (
	"crowsnest/internal/util"
	"log"
	"strings"
	"time"

	"github.com/gocolly/colly/v2"
)

type WebFeed struct {
	feed      chan *Resource
	collector *colly.Collector
}

// Init the WebFeed, starting the process of collecting Resources.
func (sf *WebFeed) Init(indexUrl string, options ...colly.CollectorOption) {
	// create feed
	sf.feed = make(chan *Resource, 100)

	// set cache, domain pattern and max recursion depth
	sf.collector = colly.NewCollector(options...)

	// return IResources aka pages
	sf.collector.OnResponse(func(r *colly.Response) {
		url := r.Request.URL.String()
		body := string(r.Body)
		sf.feed <- &Resource{Url: url, Body: body}
	})

	// cascade
	sf.collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		url := e.Attr("href")
		if !strings.HasPrefix(url, "http") {
			return
		}
		e.Request.Visit(url)
	})

	// start runner
	go sf.runner(indexUrl)
}

// Get the channel into which the collected Resources will be written.
func (sf *WebFeed) Feed() <-chan *Resource {
	return sf.feed
}

func (sf *WebFeed) runner(indexUrl string) {
	for {
		// sleep for 5min
		time.Sleep(time.Second * 300)

		// collect index
		urls, err := util.GetAllURLs(indexUrl)
		if err != nil {
			log.Println("error in WebFeed runner: ", err.Error())
			continue
		}

		// visit urls
		for _, url := range urls {
			sf.collector.Visit(url)
		}
	}
}
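
For context, a minimal sketch of how a consumer could have driven this (now removed) WebFeed: Init is called with an index URL and colly options, and the caller drains the Feed() channel. The main package, the index URL, and the specific colly options below are assumptions for illustration, not code from this repository.

// Hypothetical usage sketch, not part of this repository.
package main

import (
	"log"

	"crowsnest/internal/crawler"
	"github.com/gocolly/colly/v2"
)

func main() {
	var wf crawler.WebFeed

	// Placeholder index URL and collector options.
	wf.Init(
		"https://example.com/index",
		colly.MaxDepth(2),
		colly.AllowedDomains("example.com"),
	)

	// Drain the feed; each Resource carries the page URL and body.
	for res := range wf.Feed() {
		log.Printf("fetched %s (%d bytes)", res.Url, len(res.Body))
	}
}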