add zeit.de collector
This commit is contained in:
@@ -13,17 +13,17 @@ import (
|
||||
"github.com/gocolly/colly/v2"
|
||||
)
|
||||
|
||||
func (c *Collector) SpiegelCollect() {
|
||||
func (c *Collector) CollectSpiegel() {
|
||||
collycollector := colly.NewCollector(
|
||||
colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
|
||||
colly.CacheDir("./persistence/spiegel_cache"),
|
||||
colly.MaxDepth(5),
|
||||
)
|
||||
|
||||
// cache
|
||||
// store articles
|
||||
collycollector.OnResponse(func(r *colly.Response) {
|
||||
url := r.Request.URL.String()
|
||||
err := c.SpiegelExtract(url, r.Body)
|
||||
err := c.ExtractSpiegel(url, r.Body)
|
||||
if err == nil {
|
||||
log.Println("added article", url)
|
||||
} else {
|
||||
@@ -53,7 +53,7 @@ func (c *Collector) SpiegelCollect() {
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Collector) SpiegelExtract(url string, body []byte) error {
|
||||
func (c *Collector) ExtractSpiegel(url string, body []byte) error {
|
||||
paywall_pattern := regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
|
||||
url_pattern := regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
|
||||
whitespace := regexp.MustCompile(`\s+`)
|
||||
|
||||
Reference in New Issue
Block a user