diff --git a/cmd/crawler/collectors/collector.go b/cmd/crawler/collectors/collector.go
index 6635026..b3967f3 100644
--- a/cmd/crawler/collectors/collector.go
+++ b/cmd/crawler/collectors/collector.go
@@ -3,6 +3,5 @@ package collectors
 import "crowsnest/internal/model/database"
 
 type Collector struct {
-	Responses *database.ResponseModel
-	Articles  *database.ArticleModel
+	Articles *database.ArticleModel
 }
diff --git a/cmd/crawler/collectors/spiegel.go b/cmd/crawler/collectors/spiegel.go
index 40e8644..5649e09 100644
--- a/cmd/crawler/collectors/spiegel.go
+++ b/cmd/crawler/collectors/spiegel.go
@@ -17,7 +17,7 @@ func (c *Collector) CollectSpiegel() {
 	collycollector := colly.NewCollector(
 		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
 		colly.CacheDir("./persistence/spiegel_cache"),
-		colly.MaxDepth(5),
+		colly.MaxDepth(3),
 	)
 
 	// store articles
@@ -42,7 +42,7 @@ func (c *Collector) CollectSpiegel() {
 	})
 
 	// go through archive
-	startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)
+	startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
 	currentDate := time.Now()
 
 	for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 1) {
diff --git a/cmd/crawler/collectors/zeit.go b/cmd/crawler/collectors/zeit.go
index 606b988..88e7056 100644
--- a/cmd/crawler/collectors/zeit.go
+++ b/cmd/crawler/collectors/zeit.go
@@ -19,7 +19,7 @@ func (c *Collector) CollectZeit() {
 	collycollector := colly.NewCollector(
 		colly.AllowedDomains("www.zeit.de", "zeit.de"),
 		colly.CacheDir("./persistence/zeit_cache"),
-		colly.MaxDepth(5),
+		colly.MaxDepth(3),
 	)
 
 	// store articles
@@ -44,7 +44,7 @@ func (c *Collector) CollectZeit() {
 	})
 
 	// go through archive
-	startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)
+	startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
 	//startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
 	currentDate := time.Now()
 
diff --git a/cmd/crawler/main.go b/cmd/crawler/main.go
index d9cea5a..a08fa65 100644
--- a/cmd/crawler/main.go
+++ b/cmd/crawler/main.go
@@ -6,6 +6,7 @@ import (
 	"database/sql"
 	"log"
 	"os"
+	"sync"
 
 	_ "github.com/lib/pq"
 )
@@ -23,10 +24,20 @@ func main() {
 
 	// collect websites
 	coll := collectors.Collector{
-		Responses: &database.ResponseModel{DB: db},
-		Articles:  &database.ArticleModel{DB: db},
+		Articles: &database.ArticleModel{DB: db},
 	}
 
-	coll.SpiegelCollect()
-	//coll.Zeit()
+	var wg sync.WaitGroup
+	wg.Add(2)
+
+	go func() {
+		defer wg.Done()
+		coll.CollectSpiegel()
+	}()
+	go func() {
+		defer wg.Done()
+		coll.CollectZeit()
+	}()
+
+	wg.Wait()
 }
diff --git a/internal/model/database/responeses.go b/internal/model/database/responeses.go
deleted file mode 100644
index b024918..0000000
--- a/internal/model/database/responeses.go
+++ /dev/null
@@ -1,134 +0,0 @@
-package database
-
-import (
-	"crowsnest/internal/model"
-	"database/sql"
-)
-
-type ResponseModel struct {
-	DB *sql.DB
-}
-
-// Get all the response object from the database. May throw an error if the
-// connection to the database fails.
-func (m *ResponseModel) All() ([]model.Response, error) {
-	stmt := `
-		SELECT url, content, fetchDate, processed
-		FROM responses
-	`
-	rows, err := m.DB.Query(stmt)
-	if err != nil {
-		return nil, err
-	}
-
-	responses := []model.Response{}
-	for rows.Next() {
-		r := model.Response{}
-		err := rows.Scan(&r.Url, &r.Content, &r.FetchDate, &r.Processed)
-		if err != nil {
-			return nil, err
-		}
-
-		responses = append(responses, r)
-	}
-
-	if err = rows.Err(); err != nil {
-		return nil, err
-	}
-
-	return responses, nil
-}
-
-// Gets all those Response objects where the processed column is set to false.
-// May throw an error if the connection to the database fails.
-func (m *ResponseModel) UnprocessedUrls() ([]string, error) {
-	stmt := `
-		SELECT url
-		FROM responses
-		WHERE NOT processed
-	`
-	rows, err := m.DB.Query(stmt)
-	if err != nil {
-		return nil, err
-	}
-
-	urls := make([]string, 0)
-	for rows.Next() {
-		r := ""
-		err := rows.Scan(&r)
-		if err != nil {
-			return nil, err
-		}
-
-		urls = append(urls, r)
-	}
-
-	if err = rows.Err(); err != nil {
-		return nil, err
-	}
-
-	return urls, nil
-}
-
-// Checks if a given url exits in the database as a response. This may throw an
-// error if the connection to the database fails.
-func (m *ResponseModel) UrlExists(url string) (bool, error) {
-	stmt := `
-		SELECT count(url) > 0 FROM responses WHERE url = $1
-	`
-
-	var result bool
-	row := m.DB.QueryRow(stmt, url)
-	err := row.Scan(&result)
-
-	return result, err
-}
-
-// Gets a certain Response object by the unique url. This may throw an error if
-// there does not exist an response with the given url or the connection to the
-// database fails.
-func (m *ResponseModel) GetByUrl(url string) (model.Response, error) {
-	stmt := `
-		SELECT url, content, fetchDate, processed
-		FROM responses
-		WHERE url = $1
-	`
-
-	res := model.Response{}
-	row := m.DB.QueryRow(stmt, url)
-	err := row.Scan(&res.Url, &res.Content, &res.FetchDate, &res.Processed)
-
-	return res, err
-}
-
-// Inserts a new response object into the database given the url and response
-// body. This may fail on an unique contraint of the url or the connection to
-// the database.
-func (m *ResponseModel) Insert(url string, content []byte) error {
-	// insert response
-	stmt := `INSERT INTO responses (url, content) VALUES ($1, $2)`
-	_, err := m.DB.Exec(stmt, url, content)
-
-	return err
-}
-
-// Updates a response object in the database using the url of the given response
-// object as the id. This may throw an error if connection to the database
-// fails.
-func (m *ResponseModel) Update(res *model.Response) error {
-	// insert response
-	stmt := `UPDATE responses SET content = $1, fetchDate = $2, processed = $3 WHERE url = $4`
-	_, err := m.DB.Exec(stmt, res.Content, res.FetchDate, res.Processed, res.Url)
-
-	return err
-}
-
-// Sets the processed column of a certain response entry in the database, given
-// an url, to true. This may fail if the connection to the database fails.
-func (m *ResponseModel) Processed(url string) error {
-	// insert response
-	stmt := `UPDATE responses SET processed = true WHERE url = $1`
-	_, err := m.DB.Exec(stmt, url)
-
-	return err
-}
diff --git a/internal/model/response.go b/internal/model/response.go
deleted file mode 100644
index a4255b9..0000000
--- a/internal/model/response.go
+++ /dev/null
@@ -1,11 +0,0 @@
-package model
-
-import "time"
-
-// A simple cache for requests.
-type Response struct {
-	Url       string
-	Content   []byte
-	Processed bool
-	FetchDate time.Time
-}
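
Note on the new main.go wiring: the two collectors now run as goroutines coordinated by a sync.WaitGroup, so the process exits only after both crawls have finished. What follows is a minimal, self-contained sketch of that pattern and not part of the patch; collectSpiegel and collectZeit are hypothetical stand-ins for coll.CollectSpiegel and coll.CollectZeit, and the sleeps merely simulate crawl work.

package main

import (
	"fmt"
	"sync"
	"time"
)

// collectSpiegel and collectZeit stand in for the real collector methods;
// they only simulate long-running crawl work.
func collectSpiegel() {
	time.Sleep(100 * time.Millisecond)
	fmt.Println("spiegel collector finished")
}

func collectZeit() {
	time.Sleep(150 * time.Millisecond)
	fmt.Println("zeit collector finished")
}

func main() {
	var wg sync.WaitGroup
	wg.Add(2) // one slot per collector goroutine

	go func() {
		defer wg.Done() // fires once collectSpiegel has returned
		collectSpiegel()
	}()
	go func() {
		defer wg.Done() // must wrap a synchronous call, or Done fires too early
		collectZeit()
	}()

	// Block until both collectors have called Done; otherwise main would
	// return and terminate the crawler while the goroutines are still running.
	wg.Wait()
}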