remove model files for responses

This commit is contained in:
2025-01-07 12:32:38 +01:00
parent ce10e1e62b
commit 321ccfe44d
6 changed files with 20 additions and 155 deletions

View File

@@ -3,6 +3,5 @@ package collectors
import "crowsnest/internal/model/database" import "crowsnest/internal/model/database"
type Collector struct { type Collector struct {
Responses *database.ResponseModel Articles *database.ArticleModel
Articles *database.ArticleModel
} }

View File

@@ -17,7 +17,7 @@ func (c *Collector) CollectSpiegel() {
collycollector := colly.NewCollector( collycollector := colly.NewCollector(
colly.AllowedDomains("www.spiegel.de", "spiegel.de"), colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
colly.CacheDir("./persistence/spiegel_cache"), colly.CacheDir("./persistence/spiegel_cache"),
colly.MaxDepth(5), colly.MaxDepth(3),
) )
// store articles // store articles
@@ -42,7 +42,7 @@ func (c *Collector) CollectSpiegel() {
}) })
// go through archive // go through archive
startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC) startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
currentDate := time.Now() currentDate := time.Now()
for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 1) { for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 1) {

View File

@@ -19,7 +19,7 @@ func (c *Collector) CollectZeit() {
collycollector := colly.NewCollector( collycollector := colly.NewCollector(
colly.AllowedDomains("www.zeit.de", "zeit.de"), colly.AllowedDomains("www.zeit.de", "zeit.de"),
colly.CacheDir("./persistence/zeit_cache"), colly.CacheDir("./persistence/zeit_cache"),
colly.MaxDepth(5), colly.MaxDepth(3),
) )
// store articles // store articles
@@ -44,7 +44,7 @@ func (c *Collector) CollectZeit() {
}) })
// go through archive // go through archive
startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC) startDate := time.Date(2020, time.January, 1, 0, 0, 0, 0, time.UTC)
//startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC) //startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC)
currentDate := time.Now() currentDate := time.Now()

View File

@@ -6,6 +6,7 @@ import (
"database/sql" "database/sql"
"log" "log"
"os" "os"
"sync"
_ "github.com/lib/pq" _ "github.com/lib/pq"
) )
@@ -23,10 +24,20 @@ func main() {
// collect websites // collect websites
coll := collectors.Collector{ coll := collectors.Collector{
Responses: &database.ResponseModel{DB: db}, Articles: &database.ArticleModel{DB: db},
Articles: &database.ArticleModel{DB: db},
} }
coll.SpiegelCollect() var wg sync.WaitGroup
//coll.Zeit() wg.Add(2)
go func() {
defer wg.Done()
coll.CollectSpiegel()
}()
go func() {
	defer wg.Done()
	// Call the collector synchronously inside this goroutine. Spawning a
	// nested goroutine here would let wg.Done fire immediately, so wg.Wait
	// could return (and main exit) before the Zeit collector has finished.
	coll.CollectZeit()
}()
wg.Wait()
} }

View File

@@ -1,134 +0,0 @@
package database
import (
"crowsnest/internal/model"
"database/sql"
)
// ResponseModel groups the queries that read and write the responses table.
type ResponseModel struct {
// DB is the database handle on which every ResponseModel method runs its
// statements.
DB *sql.DB
}
// All returns every response stored in the responses table.
//
// The returned slice is empty (not nil) when the table has no rows. An error
// is returned if the query cannot be executed, if a row cannot be scanned
// into a model.Response, or if iterating over the result set fails.
func (m *ResponseModel) All() ([]model.Response, error) {
	stmt := `
SELECT url, content, fetchDate, processed
FROM responses
`
	rows, err := m.DB.Query(stmt)
	if err != nil {
		return nil, err
	}
	// Release the result set (and the connection it holds) on every return
	// path, including the early return on a Scan error below.
	defer rows.Close()

	responses := []model.Response{}
	for rows.Next() {
		r := model.Response{}
		if err := rows.Scan(&r.Url, &r.Content, &r.FetchDate, &r.Processed); err != nil {
			return nil, err
		}
		responses = append(responses, r)
	}
	// rows.Next returns false both at the end of the data and on failure;
	// rows.Err distinguishes the two.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return responses, nil
}
// UnprocessedUrls returns the url of every response whose processed column
// is false.
//
// The returned slice is empty (not nil) when no such response exists. An
// error is returned if the query cannot be executed, if a row cannot be
// scanned, or if iterating over the result set fails.
func (m *ResponseModel) UnprocessedUrls() ([]string, error) {
	stmt := `
SELECT url
FROM responses
WHERE NOT processed
`
	rows, err := m.DB.Query(stmt)
	if err != nil {
		return nil, err
	}
	// Release the result set (and the connection it holds) on every return
	// path, including the early return on a Scan error below.
	defer rows.Close()

	urls := make([]string, 0)
	for rows.Next() {
		var u string
		if err := rows.Scan(&u); err != nil {
			return nil, err
		}
		urls = append(urls, u)
	}
	// rows.Next returns false both at the end of the data and on failure;
	// rows.Err distinguishes the two.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return urls, nil
}
// UrlExists reports whether a response with the given url is stored in the
// responses table. The error is non-nil if running the query or scanning its
// single boolean result fails.
func (m *ResponseModel) UrlExists(url string) (bool, error) {
	var exists bool
	err := m.DB.QueryRow(`
SELECT count(url) > 0 FROM responses WHERE url = $1
`, url).Scan(&exists)
	return exists, err
}
// GetByUrl looks up the response stored under the given url and returns it.
// The error is non-nil if the query fails or its result cannot be scanned
// into a model.Response, for instance when no row matches the url.
func (m *ResponseModel) GetByUrl(url string) (model.Response, error) {
	const query = `
SELECT url, content, fetchDate, processed
FROM responses
WHERE url = $1
`
	var found model.Response
	err := m.DB.QueryRow(query, url).Scan(
		&found.Url,
		&found.Content,
		&found.FetchDate,
		&found.Processed,
	)
	return found, err
}
// Insert stores a new row in the responses table made of the given url and
// response body. The error from executing the statement is returned as is;
// per the original contract this may be a unique-constraint violation on the
// url or a database connection failure.
func (m *ResponseModel) Insert(url string, content []byte) error {
	const query = `INSERT INTO responses (url, content) VALUES ($1, $2)`
	if _, err := m.DB.Exec(query, url, content); err != nil {
		return err
	}
	return nil
}
// Update overwrites the content, fetchDate and processed columns of the row
// whose url equals res.Url with the values held by res. The statement result
// is discarded; only the error from executing it is returned.
func (m *ResponseModel) Update(res *model.Response) error {
	const query = `UPDATE responses SET content = $1, fetchDate = $2, processed = $3 WHERE url = $4`
	_, execErr := m.DB.Exec(query, res.Content, res.FetchDate, res.Processed, res.Url)
	return execErr
}
// Processed sets the processed column to true for the row with the given
// url. The statement result is discarded; only the error from executing it
// is returned.
func (m *ResponseModel) Processed(url string) error {
	const markProcessed = `UPDATE responses SET processed = true WHERE url = $1`
	_, execErr := m.DB.Exec(markProcessed, url)
	return execErr
}

View File

@@ -1,11 +0,0 @@
package model
import "time"
// Response is a simple cache entry for a request. Its fields mirror the url,
// content, processed and fetchDate columns of the responses table.
type Response struct {
// Url is the key by which a response is looked up and updated.
Url string
// Content holds the response body.
Content []byte
// Processed is the flag stored in the processed column.
Processed bool
// FetchDate is presumably the time the content was fetched. TODO confirm
// where the value is set.
FetchDate time.Time
}