rename spiegel converter
This commit is contained in:
@@ -67,7 +67,7 @@ func main() {
|
|||||||
// define app
|
// define app
|
||||||
webapp := app.NewApp(db)
|
webapp := app.NewApp(db)
|
||||||
|
|
||||||
// create middle
|
// create middleware
|
||||||
stack := middleware.CreateStack(
|
stack := middleware.CreateStack(
|
||||||
middleware.Logging,
|
middleware.Logging,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -8,62 +8,66 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type CrawlerFacade struct {
|
type CrawlerFacade struct {
|
||||||
spiegelFeedDistributer *util.Distributer[*model.Article]
|
spiegelFeedDistributer *util.Distributer[*model.Article]
|
||||||
zeitFeedDistributer *util.Distributer[*model.Article]
|
zeitFeedDistributer *util.Distributer[*model.Article]
|
||||||
}
|
}
|
||||||
|
|
||||||
func (cf *CrawlerFacade) Init() {
|
func (cf *CrawlerFacade) Init() {
|
||||||
// init
|
// init
|
||||||
cf.spiegelFeedDistributer = &util.Distributer[*model.Article]{}
|
cf.spiegelFeedDistributer = &util.Distributer[*model.Article]{}
|
||||||
cf.spiegelFeedDistributer.Init()
|
cf.spiegelFeedDistributer.Init()
|
||||||
cf.zeitFeedDistributer = &util.Distributer[*model.Article]{}
|
cf.zeitFeedDistributer = &util.Distributer[*model.Article]{}
|
||||||
cf.zeitFeedDistributer.Init()
|
cf.zeitFeedDistributer.Init()
|
||||||
|
|
||||||
// run spiegel feed
|
// run spiegel feed
|
||||||
sf := &WebFeed{}
|
sf := &WebFeed{}
|
||||||
sf.Init(
|
sf.Init(
|
||||||
"https://www.spiegel.de/",
|
"https://www.spiegel.de/",
|
||||||
colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
|
colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
|
||||||
colly.CacheDir("./persistence/spiegel_cache"),
|
colly.CacheDir("./persistence/spiegel_cache"),
|
||||||
colly.MaxDepth(1),
|
colly.MaxDepth(1),
|
||||||
)
|
)
|
||||||
sf_feed := sf.Feed()
|
sf_feed := sf.Feed()
|
||||||
sf_converter := ConverterSpiegel{}
|
sf_converter := SpiegelConverter{}
|
||||||
sf_converter.Init()
|
sf_converter.Init()
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
for val := range sf_feed {
|
for val := range sf_feed {
|
||||||
article, err := sf_converter.Convert(val)
|
article, err := sf_converter.Convert(val)
|
||||||
if err != nil { continue }
|
if err != nil {
|
||||||
cf.spiegelFeedDistributer.Publish(article)
|
continue
|
||||||
}
|
}
|
||||||
}()
|
cf.spiegelFeedDistributer.Publish(article)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// run zeit feed
|
// run zeit feed
|
||||||
zf := &WebFeed{}
|
zf := &WebFeed{}
|
||||||
zf.Init(
|
zf.Init(
|
||||||
"https://www.zeit.de/index",
|
"https://www.zeit.de/index",
|
||||||
colly.AllowedDomains("www.zeit.de", "zeit.de"),
|
colly.AllowedDomains("www.zeit.de", "zeit.de"),
|
||||||
colly.CacheDir("./persistence/zeit_cache"),
|
colly.CacheDir("./persistence/zeit_cache"),
|
||||||
colly.MaxDepth(1),
|
colly.MaxDepth(1),
|
||||||
)
|
)
|
||||||
zf_feed := zf.Feed()
|
zf_feed := zf.Feed()
|
||||||
zf_converter := ZeitConverter{}
|
zf_converter := ZeitConverter{}
|
||||||
zf_converter.Init()
|
zf_converter.Init()
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
for val := range zf_feed {
|
for val := range zf_feed {
|
||||||
article, err := zf_converter.Convert(val)
|
article, err := zf_converter.Convert(val)
|
||||||
if err != nil { continue }
|
if err != nil {
|
||||||
cf.zeitFeedDistributer.Publish(article)
|
continue
|
||||||
}
|
}
|
||||||
}()
|
cf.zeitFeedDistributer.Publish(article)
|
||||||
|
}
|
||||||
|
}()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (cf *CrawlerFacade) SubscribeToSpiegelFeed(hook func(*model.Article)) {
|
func (cf *CrawlerFacade) SubscribeToSpiegelFeed(hook func(*model.Article)) {
|
||||||
cf.spiegelFeedDistributer.Subscribe(hook)
|
cf.spiegelFeedDistributer.Subscribe(hook)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (cf *CrawlerFacade) SubscribeToZeitFeed(hook func(*model.Article)) {
|
func (cf *CrawlerFacade) SubscribeToZeitFeed(hook func(*model.Article)) {
|
||||||
cf.zeitFeedDistributer.Subscribe(hook)
|
cf.zeitFeedDistributer.Subscribe(hook)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,19 +10,19 @@ import (
|
|||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ConverterSpiegel struct {
|
type SpiegelConverter struct {
|
||||||
pattern_paywall *regexp.Regexp
|
pattern_paywall *regexp.Regexp
|
||||||
pattern_url *regexp.Regexp
|
pattern_url *regexp.Regexp
|
||||||
pattern_whitespace *regexp.Regexp
|
pattern_whitespace *regexp.Regexp
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *ConverterSpiegel) Init() {
|
func (c *SpiegelConverter) Init() {
|
||||||
c.pattern_paywall = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
|
c.pattern_paywall = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
|
||||||
c.pattern_url = regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
|
c.pattern_url = regexp.MustCompile(`^https://(www\.)?spiegel.de.*`)
|
||||||
c.pattern_whitespace = regexp.MustCompile(`\s+`)
|
c.pattern_whitespace = regexp.MustCompile(`\s+`)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *ConverterSpiegel) Convert(res *Resource) (*model.Article, error) {
|
func (c *SpiegelConverter) Convert(res *Resource) (*model.Article, error) {
|
||||||
// check url pattern
|
// check url pattern
|
||||||
if !c.pattern_url.Match([]byte(res.Url)) {
|
if !c.pattern_url.Match([]byte(res.Url)) {
|
||||||
return nil, errors.New("invalid url pattern")
|
return nil, errors.New("invalid url pattern")
|
||||||
|
|||||||
Reference in New Issue
Block a user