Restructure crawler architecture
3 src/internal/util/Converter.go Normal file
@@ -0,0 +1,3 @@
package util

type Converter[I any, O any] func(I) (O, error)
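A Converter is just a named function type for any fallible I-to-O transformation. As a quick illustration (not part of this commit; the type is inlined so the sketch is self-contained), strconv.Atoi already matches the shape Converter[string, int]:

package main

import (
	"fmt"
	"strconv"
)

// Converter mirrors the type added in this commit, inlined here so the
// sketch compiles on its own.
type Converter[I any, O any] func(I) (O, error)

func main() {
	// strconv.Atoi has the signature func(string) (int, error), so it
	// satisfies Converter[string, int] directly.
	var toInt Converter[string, int] = strconv.Atoi

	n, err := toInt("42")
	fmt.Println(n, err) // 42 <nil>
}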
37 src/internal/util/Distributer.go Normal file
@@ -0,0 +1,37 @@
package util

type Distributer[T IClone[T]] struct {
	queue chan T
	hooks []func(T)
}

func (d *Distributer[T]) Init() {
	d.queue = make(chan T, 100)
	d.hooks = make([]func(T), 0)
}

// Distribute a copy of an item to every hook that has subscribed to this
// Distributer.
func (d *Distributer[T]) Publish(item T) {
	d.queue <- item
}

// Add a new hook to the Distributer. The hook will be called asynchronously
// whenever a new item is published.
func (d *Distributer[T]) Subscribe(hook func(T)) {
	d.hooks = append(d.hooks, hook)
	if len(d.hooks) == 1 {
		go d.runner()
	}
}

// Started to run asynchronously when Subscribe is first called. Whenever
// Publish is called, the runner distributes a clone of the new item to
// every hook.
func (d *Distributer[T]) runner() {
	for val := range d.queue {
		for _, f := range d.hooks {
			go f(val.Clone())
		}
	}
}
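A minimal usage sketch, not part of the commit: the Page type, its field, and the module import path are assumptions made so the example compiles. It shows the fan-out where every subscriber gets its own clone in its own goroutine:

package main

import (
	"fmt"
	"time"

	"crawler/internal/util" // module path is an assumption for this sketch
)

// Page is a hypothetical crawler item; with only plain value fields, a
// value copy is already a safe clone.
type Page struct {
	URL string
}

func (p Page) Clone() Page { return p }

func main() {
	var d util.Distributer[Page]
	d.Init()

	// Each hook receives its own clone, so subscribers never share state.
	d.Subscribe(func(p Page) { fmt.Println("indexer saw", p.URL) })
	d.Subscribe(func(p Page) { fmt.Println("archiver saw", p.URL) })

	d.Publish(Page{URL: "https://example.com"})

	// Hooks run in their own goroutines; a short sleep stands in for real
	// synchronization in this sketch.
	time.Sleep(100 * time.Millisecond)
}

Handing each hook its own Clone trades some allocation for isolation: a hook can mutate its copy without affecting the other subscribers.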
5 src/internal/util/IClone.go Normal file
@@ -0,0 +1,5 @@
package util

type IClone[T any] interface {
	Clone() T
}
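A value-receiver Clone is enough for flat structs, but types with reference fields need a deep copy to uphold the isolation Distributer relies on. A sketch of such an implementation (CrawlResult and its fields are hypothetical; the declarations would sit in some consumer package):

// CrawlResult is a hypothetical item carrying a reference field.
type CrawlResult struct {
	URL   string
	Links []string
}

func (r CrawlResult) Clone() CrawlResult {
	c := r // copies URL and the slice header, but not the backing array
	// Copy the backing array too; otherwise every clone would share it
	// and concurrent hooks could race on the same Links data.
	c.Links = append([]string(nil), r.Links...)
	return c
}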
43 src/internal/util/web.go Normal file
@@ -0,0 +1,43 @@
package util

import (
	"fmt"
	"net/http"

	"github.com/PuerkitoBio/goquery"
)

// GetAllURLs fetches all URLs from a given web page URL
func GetAllURLs(pageURL string) ([]string, error) {
	// Send a GET request to the provided URL
	resp, err := http.Get(pageURL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL %s: %w", pageURL, err)
	}
	defer resp.Body.Close()

	// Check if the response status is OK
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP request failed with status code %d", resp.StatusCode)
	}

	// Parse the HTML document
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}

	// Slice to store the extracted URLs
	var urls []string

	// Select all anchor tags and extract the href attribute
	doc.Find("a").Each(func(index int, element *goquery.Selection) {
		// Get the href attribute
		href, exists := element.Attr("href")
		if exists {
			urls = append(urls, href)
		}
	})

	return urls, nil
}
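A short caller sketch, again not part of the commit (the module import path is an assumption). Note that GetAllURLs returns hrefs exactly as written in the page, so relative links come back unresolved:

package main

import (
	"fmt"
	"log"

	"crawler/internal/util" // module path is an assumption for this sketch
)

func main() {
	urls, err := util.GetAllURLs("https://example.com")
	if err != nil {
		log.Fatal(err)
	}
	for _, u := range urls {
		// Relative hrefs such as "/about" are printed as-is; a caller
		// needing absolute URLs can resolve them with net/url.
		fmt.Println(u)
	}
}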