package extractors

import (
	"log"
	"regexp"
	"strings"
	"time"

	"crowsnest/internal/model"

	"github.com/PuerkitoBio/goquery"
)

// spiegelPaywallInactive is the literal marker the extractor searches for in a
// response body; pages that do not contain it are treated as paywalled.
const spiegelPaywallInactive = `"paywall":{"attributes":{"is_active":false`

var (
	// spiegelURLPattern accepts only https URLs whose host is exactly
	// spiegel.de or www.spiegel.de. The dot is escaped and the host must be
	// followed by '/', '?', '#' or the end of the string, so look-alike hosts
	// such as "spiegel.de.example.com" are rejected.
	spiegelURLPattern = regexp.MustCompile(`^https://(www\.)?spiegel\.de(?:[/?#]|$)`)

	// spiegelWhitespace matches any run of whitespace so it can be collapsed
	// into a single space.
	spiegelWhitespace = regexp.MustCompile(`\s+`)

	// spiegelQuotes rewrites guillemets to plain double quotes.
	spiegelQuotes = strings.NewReplacer("»", `"`, "«", `"`)
)

// Spiegel walks over every URL returned by Responses.UnprocessedUrls, and for
// each spiegel.de URL tries to turn the stored response into a model.Article
// that is handed to Articles.Insert.
//
// A response is marked as processed when it is paywalled, when it lacks any of
// the required metadata, or when its article was inserted successfully.
// Responses that could not be loaded, parsed or inserted are only logged and
// stay unprocessed.
//
// The returned error is non-nil only when the list of unprocessed URLs cannot
// be obtained; per-URL failures never abort the run.
func (extractor *Extractor) Spiegel() error {
	urls, err := extractor.Responses.UnprocessedUrls()
	if err != nil {
		return err
	}

	for _, url := range urls {
		// Leave URLs of other sites untouched for their own extractors.
		if !spiegelURLPattern.MatchString(url) {
			continue
		}

		res, err := extractor.Responses.GetByUrl(url)
		if err != nil {
			log.Println("failed to process url", url, "with", err)
			continue
		}

		// Without the "paywall inactive" marker there is no usable content.
		if !strings.Contains(res.Content, spiegelPaywallInactive) {
			extractor.Responses.Processed(url)
			continue
		}

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Content))
		if err != nil {
			log.Println("failed to process url", url, "with", err)
			continue
		}

		article, ok := spiegelArticle(doc)
		if !ok {
			// Not an article page or metadata is missing: nothing to retry.
			extractor.Responses.Processed(url)
			continue
		}
		article.SourceUrl = url
		article.FetchDate = res.FetchDate

		if err := extractor.Articles.Insert(&article); err != nil {
			// Keep the response unprocessed and report why the insert failed.
			log.Println("failed to insert article from", url, "with", err)
			continue
		}

		extractor.Responses.Processed(url)
		log.Println("found article at", url)
	}

	return nil
}

// spiegelArticle reads title, author, publishing date and text from doc. The
// second result is false when the page is not of og:type "article" or when any
// required meta tag is missing or the date cannot be parsed. SourceUrl and
// FetchDate are left for the caller to fill in.
func spiegelArticle(doc *goquery.Document) (model.Article, bool) {
	pageType, ok := spiegelMeta(doc, "meta[property='og:type']")
	if !ok || pageType != "article" {
		return model.Article{}, false
	}

	title, ok := spiegelMeta(doc, "meta[property='og:title']")
	if !ok {
		return model.Article{}, false
	}

	description, ok := spiegelMeta(doc, "meta[name='description']")
	if !ok {
		return model.Article{}, false
	}

	dateStr, ok := spiegelMeta(doc, "meta[name='date']")
	if !ok {
		return model.Article{}, false
	}
	// RFC 3339 accepts numeric offsets ("+02:00") as well as the "Z" suffix.
	published, err := time.Parse(time.RFC3339, dateStr)
	if err != nil {
		return model.Article{}, false
	}

	author, ok := spiegelMeta(doc, "meta[name='author']")
	if !ok {
		return model.Article{}, false
	}

	// The description is prepended to the paragraphs of the article body.
	var text strings.Builder
	text.WriteString(description)
	text.WriteString(" ")
	doc.Find("main[id='Inhalt'] div > p").Each(func(_ int, p *goquery.Selection) {
		text.WriteString(" ")
		text.WriteString(p.Text())
	})

	content := spiegelWhitespace.ReplaceAllString(text.String(), " ")
	content = spiegelQuotes.Replace(content)

	return model.Article{
		PublishDate: published,
		Title:       title,
		Content:     content,
		Author:      author,
	}, true
}

// spiegelMeta returns the "content" attribute of the first element matching
// selector and whether that attribute exists.
func spiegelMeta(doc *goquery.Document, selector string) (string, bool) {
	return doc.Find(selector).Attr("content")
}