restructuring crawler architecture

2025-01-20 08:58:49 +01:00
parent 9104bc7716
commit 47299d6ef3
13 changed files with 408 additions and 330 deletions


@@ -0,0 +1,3 @@
package util

// Converter maps an input of type I to an output of type O and may return an error.
type Converter[I any, O any] func(I) (O, error)
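
A quick usage sketch (caller code, not part of the commit): strconv.Atoi already has the func(string) (int, error) shape, so it can be assigned directly to a Converter[string, int]. The type here mirrors util.Converter only so the snippet stands alone.

package main

import (
    "fmt"
    "strconv"
)

// Converter mirrors util.Converter so this sketch is self-contained.
type Converter[I any, O any] func(I) (O, error)

func main() {
    // strconv.Atoi matches func(string) (int, error), i.e. Converter[string, int].
    var atoi Converter[string, int] = strconv.Atoi
    n, err := atoi("42")
    fmt.Println(n, err) // 42 <nil>
}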


@@ -0,0 +1,37 @@
package util

// Distributer fans published items out to subscribed hooks. Each hook
// receives its own clone of every item.
type Distributer[T IClone[T]] struct {
    queue chan T
    hooks []func(T)
}

// Init prepares the queue and the hook list. It must be called before
// Publish or Subscribe.
func (d *Distributer[T]) Init() {
    d.queue = make(chan T, 100)
    d.hooks = make([]func(T), 0)
}

// Publish queues an item; a clone of it is distributed to every hook that has
// subscribed to this Distributer.
func (d *Distributer[T]) Publish(item T) {
    d.queue <- item
}

// Subscribe adds a new hook to the Distributer. The hook will be called
// asynchronously whenever a new item is published. The hook slice is not
// guarded by a lock, so all Subscribe calls should happen before publishing
// starts.
func (d *Distributer[T]) Subscribe(hook func(T)) {
    d.hooks = append(d.hooks, hook)
    if len(d.hooks) == 1 {
        go d.runner()
    }
}

// runner is started asynchronously when Subscribe is first called. Whenever
// Publish is called, the runner distributes a clone of the new item to every
// hook.
func (d *Distributer[T]) runner() {
    for val := range d.queue {
        for _, f := range d.hooks {
            go f(val.Clone())
        }
    }
}


@@ -0,0 +1,5 @@
package util

// IClone is implemented by types that can return a copy of themselves.
type IClone[T any] interface {
    Clone() T
}
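
A hypothetical wiring of the two pieces (not part of the commit, written as if inside the util package, assuming fmt and time are imported): any type with a Clone method satisfies the IClone constraint and can flow through a Distributer.

// Page is a hypothetical item type; its Clone method satisfies IClone[Page].
type Page struct {
    URL string
}

func (p Page) Clone() Page { return Page{URL: p.URL} }

func exampleDistribute() {
    var d Distributer[Page]
    d.Init()
    // Each hook receives its own clone of every published item.
    d.Subscribe(func(p Page) { fmt.Println("hook saw", p.URL) })
    d.Publish(Page{URL: "https://example.com"})
    // Hooks run asynchronously, so give them a moment in this toy example.
    time.Sleep(100 * time.Millisecond)
}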

src/internal/util/web.go

@@ -0,0 +1,43 @@
package util

import (
    "fmt"
    "net/http"

    "github.com/PuerkitoBio/goquery"
)

// GetAllURLs fetches all URLs from a given web page URL.
func GetAllURLs(pageURL string) ([]string, error) {
    // Send a GET request to the provided URL.
    resp, err := http.Get(pageURL)
    if err != nil {
        return nil, fmt.Errorf("failed to fetch URL %s: %w", pageURL, err)
    }
    defer resp.Body.Close()

    // Check if the response status is OK.
    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("HTTP request failed with status code %d", resp.StatusCode)
    }

    // Parse the HTML document.
    doc, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("failed to parse HTML: %w", err)
    }

    // Slice to store the extracted URLs.
    var urls []string

    // Select all anchor tags and extract the href attribute.
    doc.Find("a").Each(func(index int, element *goquery.Selection) {
        // Get the href attribute.
        href, exists := element.Attr("href")
        if exists {
            urls = append(urls, href)
        }
    })

    return urls, nil
}
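
A minimal caller sketch (not part of the commit, placed in the same package; the helper name and example flow are illustrative only). GetAllURLs returns href values exactly as written in the page, so relative links would still need to be resolved against the page URL, e.g. with net/url, before they can be fetched:

// crawlOnce is a hypothetical helper that fetches a page's links and resolves
// relative hrefs against the page URL (requires the net/url import).
func crawlOnce(page string) ([]string, error) {
    base, err := url.Parse(page)
    if err != nil {
        return nil, err
    }
    hrefs, err := GetAllURLs(page)
    if err != nil {
        return nil, err
    }
    resolved := make([]string, 0, len(hrefs))
    for _, h := range hrefs {
        ref, err := url.Parse(h)
        if err != nil {
            continue // skip malformed hrefs
        }
        resolved = append(resolved, base.ResolveReference(ref).String())
    }
    return resolved, nil
}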