// Command main crawls spiegel.de, collects articles that are not marked as
// paywalled, and writes them to a file-backed datastore.
package main

import (
	"fmt"
	"log"
	"regexp"
	"strings"
	"time"

	"crowsnest/internal/data"
	"crowsnest/internal/model"

	"github.com/gocolly/colly/v2"
)

var (
	// paywallInactivePattern matches the marker in a response body that
	// reports the paywall as inactive. The braces are not valid repetition
	// operators, so the regexp engine treats them as literals.
	paywallInactivePattern = regexp.MustCompile(`"paywall":{"attributes":{"is_active":false`)

	// whitespacePattern matches any run of whitespace. It is compiled once
	// here instead of once per scraped paragraph.
	whitespacePattern = regexp.MustCompile(`\s+`)
)

// normalizeText collapses every run of whitespace into a single space and
// replaces the guillemets » and « with a plain double quote.
func normalizeText(s string) string {
	s = whitespacePattern.ReplaceAllString(s, " ")
	s = strings.ReplaceAll(s, "»", "\"")
	return strings.ReplaceAll(s, "«", "\"")
}

// appendParagraph returns content extended by paragraph, separated by one
// space, with the whole result normalized. Normalizing after the append
// guarantees that the final paragraph of an article is cleaned as well; an
// empty content does not produce a leading separator.
func appendParagraph(content, paragraph string) string {
	if content == "" {
		return normalizeText(paragraph)
	}
	return normalizeText(content + " " + paragraph)
}

// spiegelCollector builds a colly collector restricted to spiegel.de that
// fills *results with one model.Article per visited URL, keyed by that URL.
//
// An entry is created in OnResponse only when the body contains the
// "paywall inactive" marker; the OnHTML handlers then enrich the entry
// (title, description, publish date, author, paragraphs) or delete it again
// when og:type is not "article". Every link found on a page is followed,
// limited by the allowed domains and a maximum depth of 2. Responses are
// cached in ./spiegel_cache.
//
// results must point to an initialized (non-nil) map.
func spiegelCollector(results *map[string]*model.Article) *colly.Collector {
	c := colly.NewCollector(
		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
		colly.CacheDir("./spiegel_cache"),
		colly.MaxDepth(2),
	)

	// lookup returns the article entry belonging to the page the element
	// was found on, if one was created for it.
	lookup := func(e *colly.HTMLElement) (*model.Article, bool) {
		article, ok := (*results)[e.Request.URL.String()]
		return article, ok
	}

	// Create an entry if the page is not behind the paywall.
	c.OnResponse(func(r *colly.Response) {
		if !paywallInactivePattern.Match(r.Body) {
			return
		}
		url := r.Request.URL.String()
		(*results)[url] = &model.Article{
			SourceUrl: url,
			FetchDate: time.Now(),
			Content:   "",
		}
	})

	// Drop the entry again if the page is not an article.
	c.OnHTML("meta[property='og:type']", func(e *colly.HTMLElement) {
		if e.Attr("content") != "article" {
			delete(*results, e.Request.URL.String())
		}
	})

	// Title.
	c.OnHTML("meta[property='og:title']", func(e *colly.HTMLElement) {
		if article, ok := lookup(e); ok {
			article.Title = e.Attr("content")
		}
	})

	// Prepend the description to the article content.
	c.OnHTML("meta[name='description']", func(e *colly.HTMLElement) {
		if article, ok := lookup(e); ok {
			article.Content = e.Attr("content") + article.Content
		}
	})

	// Publishing date. A malformed or missing date must not abort the whole
	// crawl, so it is logged and PublishDate keeps its zero value.
	// time.RFC3339 accepts the previously required "-07:00" offset form and
	// additionally the "Z" suffix.
	c.OnHTML("meta[name='date']", func(e *colly.HTMLElement) {
		article, ok := lookup(e)
		if !ok {
			return
		}
		raw := e.Attr("content")
		published, err := time.Parse(time.RFC3339, raw)
		if err != nil {
			log.Printf("parsing publish date %q of %s: %v", raw, e.Request.URL, err)
			return
		}
		article.PublishDate = published
	})

	// Author.
	c.OnHTML("meta[name='author']", func(e *colly.HTMLElement) {
		if article, ok := lookup(e); ok {
			article.Author = e.Attr("content")
		}
	})

	// Article body: append each paragraph to the normalized content.
	c.OnHTML("main[id='Inhalt'] div > p", func(e *colly.HTMLElement) {
		if article, ok := lookup(e); ok {
			article.Content = appendParagraph(article.Content, e.Text)
		}
	})

	// Cascade: follow every link on the page.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		// Visit rejects already visited URLs, foreign domains and links
		// beyond the maximum depth; those rejections are expected during a
		// crawl and are deliberately ignored.
		_ = e.Request.Visit(e.Attr("href"))
	})

	return c
}

// main crawls spiegel.de starting at the front page, stores every collected
// article in the "spiegel.json" file datastore and prints the number of
// collected articles.
func main() {
	res := make(map[string]*model.Article)

	c := spiegelCollector(&res)
	if err := c.Visit("https://www.spiegel.de/"); err != nil {
		// Keep going: whatever was collected before the failure is still
		// written to the datastore below.
		log.Printf("visiting seed page: %v", err)
	}

	// Data store.
	fds, err := data.NewFileDatastore("spiegel.json")
	if err != nil {
		log.Fatalf("opening file datastore: %v", err)
	}
	repo, err := data.NewDefaultRepository[*model.Article](fds, "article")
	if err != nil {
		log.Fatalf("creating article repository: %v", err)
	}

	for _, article := range res {
		// NOTE(review): the signature of Create is not visible from this
		// file; if it returns an error it should be checked here.
		repo.Create(article)
	}
	fmt.Println(len(res))
}