rm webfeed.go
This commit is contained in:
@@ -1,67 +0,0 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"crowsnest/internal/util"
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/gocolly/colly/v2"
|
||||
)
|
||||
|
||||
// WebFeed crawls web pages starting from an index URL and streams each
// fetched page as a *Resource over an internal channel (see Feed).
type WebFeed struct {
	feed      chan *Resource   // buffered channel of collected pages; read side exposed via Feed()
	collector *colly.Collector // underlying colly crawler, configured in Init
}
|
||||
|
||||
// Init the WebFeed, starting the process of collecting Resources.
|
||||
func (sf *WebFeed) Init(indexUrl string, options ...colly.CollectorOption) {
|
||||
// create feed
|
||||
sf.feed = make(chan *Resource, 100)
|
||||
|
||||
// set cache, domain pattern and max recursion depth
|
||||
sf.collector = colly.NewCollector(options...)
|
||||
|
||||
// return IResources aka pages
|
||||
sf.collector.OnResponse(func(r *colly.Response) {
|
||||
url := r.Request.URL.String()
|
||||
body := string(r.Body)
|
||||
sf.feed <- &Resource{Url: url, Body: body}
|
||||
})
|
||||
|
||||
// cascade
|
||||
sf.collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
url := e.Attr("href")
|
||||
if !strings.HasPrefix(url, "http") {
|
||||
return
|
||||
}
|
||||
e.Request.Visit(url)
|
||||
})
|
||||
|
||||
// start runner
|
||||
go sf.runner(indexUrl)
|
||||
}
|
||||
|
||||
// Get the channel into which the collected Resources will be written.
|
||||
func (sf *WebFeed) Feed() <-chan *Resource {
|
||||
return sf.feed
|
||||
}
|
||||
|
||||
func (sf *WebFeed) runner(indexUrl string) {
|
||||
for {
|
||||
// sleep for 5min
|
||||
time.Sleep(time.Second * 300)
|
||||
|
||||
// collect index
|
||||
urls, err := util.GetAllURLs(indexUrl)
|
||||
if err != nil {
|
||||
log.Println("error in WebFeed runner: ", err.Error())
|
||||
continue
|
||||
}
|
||||
|
||||
// visit urls
|
||||
for _, url := range urls {
|
||||
sf.collector.Visit(url)
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user