remove model files for responses
This commit is contained in:
@@ -3,6 +3,5 @@ package collectors
|
|||||||
import "crowsnest/internal/model/database"
|
import "crowsnest/internal/model/database"
|
||||||
|
|
||||||
type Collector struct {
|
type Collector struct {
|
||||||
Responses *database.ResponseModel
|
|
||||||
Articles *database.ArticleModel
|
Articles *database.ArticleModel
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ func (c *Collector) CollectSpiegel() {
|
|||||||
collycollector := colly.NewCollector(
|
collycollector := colly.NewCollector(
|
||||||
colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
|
colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
|
||||||
colly.CacheDir("./persistence/spiegel_cache"),
|
colly.CacheDir("./persistence/spiegel_cache"),
|
||||||
colly.MaxDepth(5),
|
colly.MaxDepth(3),
|
||||||
)
|
)
|
||||||
|
|
||||||
// store articles
|
// store articles
|
||||||
@@ -42,7 +42,7 @@ func (c *Collector) CollectSpiegel() {
|
|||||||
})
|
})
|
||||||
|
|
||||||
// go through archive
|
// go through archive
|
||||||
startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)
|
startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
|
||||||
currentDate := time.Now()
|
currentDate := time.Now()
|
||||||
|
|
||||||
for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 1) {
|
for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 1) {
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ func (c *Collector) CollectZeit() {
|
|||||||
collycollector := colly.NewCollector(
|
collycollector := colly.NewCollector(
|
||||||
colly.AllowedDomains("www.zeit.de", "zeit.de"),
|
colly.AllowedDomains("www.zeit.de", "zeit.de"),
|
||||||
colly.CacheDir("./persistence/zeit_cache"),
|
colly.CacheDir("./persistence/zeit_cache"),
|
||||||
colly.MaxDepth(5),
|
colly.MaxDepth(3),
|
||||||
)
|
)
|
||||||
|
|
||||||
// store articles
|
// store articles
|
||||||
@@ -44,7 +44,7 @@ func (c *Collector) CollectZeit() {
|
|||||||
})
|
})
|
||||||
|
|
||||||
// go through archive
|
// go through archive
|
||||||
startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)
|
startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
|
||||||
//startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
|
//startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
|
||||||
currentDate := time.Now()
|
currentDate := time.Now()
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import (
|
|||||||
"database/sql"
|
"database/sql"
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
|
"sync"
|
||||||
|
|
||||||
_ "github.com/lib/pq"
|
_ "github.com/lib/pq"
|
||||||
)
|
)
|
||||||
@@ -23,10 +24,20 @@ func main() {
|
|||||||
|
|
||||||
// collect websites
|
// collect websites
|
||||||
coll := collectors.Collector{
|
coll := collectors.Collector{
|
||||||
Responses: &database.ResponseModel{DB: db},
|
|
||||||
Articles: &database.ArticleModel{DB: db},
|
Articles: &database.ArticleModel{DB: db},
|
||||||
}
|
}
|
||||||
|
|
||||||
coll.SpiegelCollect()
|
var wg sync.WaitGroup
|
||||||
//coll.Zeit()
|
wg.Add(2)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
coll.CollectSpiegel()
|
||||||
|
}()
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
go coll.CollectZeit()
|
||||||
|
}()
|
||||||
|
|
||||||
|
wg.Wait()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,134 +0,0 @@
|
|||||||
package database
|
|
||||||
|
|
||||||
import (
|
|
||||||
"crowsnest/internal/model"
|
|
||||||
"database/sql"
|
|
||||||
)
|
|
||||||
|
|
||||||
type ResponseModel struct {
|
|
||||||
DB *sql.DB
|
|
||||||
}
|
|
||||||
|
|
||||||
// Gets all response objects from the database. May return an error if the
|
|
||||||
// connection to the database fails.
|
|
||||||
func (m *ResponseModel) All() ([]model.Response, error) {
|
|
||||||
stmt := `
|
|
||||||
SELECT url, content, fetchDate, processed
|
|
||||||
FROM responses
|
|
||||||
`
|
|
||||||
rows, err := m.DB.Query(stmt)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
responses := []model.Response{}
|
|
||||||
for rows.Next() {
|
|
||||||
r := model.Response{}
|
|
||||||
err := rows.Scan(&r.Url, &r.Content, &r.FetchDate, &r.Processed)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
responses = append(responses, r)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err = rows.Err(); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return responses, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Gets the urls of all responses where the processed column is set to false.
|
|
||||||
// May throw an error if the connection to the database fails.
|
|
||||||
func (m *ResponseModel) UnprocessedUrls() ([]string, error) {
|
|
||||||
stmt := `
|
|
||||||
SELECT url
|
|
||||||
FROM responses
|
|
||||||
WHERE NOT processed
|
|
||||||
`
|
|
||||||
rows, err := m.DB.Query(stmt)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
urls := make([]string, 0)
|
|
||||||
for rows.Next() {
|
|
||||||
r := ""
|
|
||||||
err := rows.Scan(&r)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
urls = append(urls, r)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err = rows.Err(); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return urls, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Checks if a given url exists in the database as a response. This may throw an
|
|
||||||
// error if the connection to the database fails.
|
|
||||||
func (m *ResponseModel) UrlExists(url string) (bool, error) {
|
|
||||||
stmt := `
|
|
||||||
SELECT count(url) > 0 FROM responses WHERE url = $1
|
|
||||||
`
|
|
||||||
|
|
||||||
var result bool
|
|
||||||
row := m.DB.QueryRow(stmt, url)
|
|
||||||
err := row.Scan(&result)
|
|
||||||
|
|
||||||
return result, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Gets a certain Response object by the unique url. This may throw an error if
|
|
||||||
// there does not exist a response with the given url or the connection to the
|
|
||||||
// database fails.
|
|
||||||
func (m *ResponseModel) GetByUrl(url string) (model.Response, error) {
|
|
||||||
stmt := `
|
|
||||||
SELECT url, content, fetchDate, processed
|
|
||||||
FROM responses
|
|
||||||
WHERE url = $1
|
|
||||||
`
|
|
||||||
|
|
||||||
res := model.Response{}
|
|
||||||
row := m.DB.QueryRow(stmt, url)
|
|
||||||
err := row.Scan(&res.Url, &res.Content, &res.FetchDate, &res.Processed)
|
|
||||||
|
|
||||||
return res, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Inserts a new response object into the database given the url and response
|
|
||||||
// body. This may fail on a unique constraint of the url or the connection to
|
|
||||||
// the database.
|
|
||||||
func (m *ResponseModel) Insert(url string, content []byte) error {
|
|
||||||
// insert response
|
|
||||||
stmt := `INSERT INTO responses (url, content) VALUES ($1, $2)`
|
|
||||||
_, err := m.DB.Exec(stmt, url, content)
|
|
||||||
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Updates a response object in the database using the url of the given response
|
|
||||||
// object as the id. This may throw an error if connection to the database
|
|
||||||
// fails.
|
|
||||||
func (m *ResponseModel) Update(res *model.Response) error {
|
|
||||||
// update response
|
|
||||||
stmt := `UPDATE responses SET content = $1, fetchDate = $2, processed = $3 WHERE url = $4`
|
|
||||||
_, err := m.DB.Exec(stmt, res.Content, res.FetchDate, res.Processed, res.Url)
|
|
||||||
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sets the processed column of a certain response entry in the database, given
|
|
||||||
// a url, to true. This may fail if the connection to the database fails.
|
|
||||||
func (m *ResponseModel) Processed(url string) error {
|
|
||||||
// mark response as processed
|
|
||||||
stmt := `UPDATE responses SET processed = true WHERE url = $1`
|
|
||||||
_, err := m.DB.Exec(stmt, url)
|
|
||||||
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
package model
|
|
||||||
|
|
||||||
import "time"
|
|
||||||
|
|
||||||
// A simple cache for requests.
|
|
||||||
type Response struct {
|
|
||||||
Url string
|
|
||||||
Content []byte
|
|
||||||
Processed bool
|
|
||||||
FetchDate time.Time
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user