diff --git a/Makefile b/Makefile
index f99b123..e826eda 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
 serv:
-	DB_DRIVER="sqlite3" DB_URL="./persistence/app.db" go run -tags='sqlite_fts5' cmd/frontend/*
+	DB_DRIVER="postgres" DB_URL="user=crow password=4LlKpnQ2RZPzL13BSpkW4k dbname=crowsnest host=192.168.0.2 port=5432 sslmode=disable" go run cmd/frontend/*
 crawl:
-	go run -tags='sqlite_fts5' cmd/crawler/main.go
+	DB_DRIVER="postgres" DB_URL="user=crow password=4LlKpnQ2RZPzL13BSpkW4k dbname=crowsnest host=192.168.0.2 port=5432 sslmode=disable" go run cmd/crawler/main.go
diff --git a/assets/migrations/postgresql/20250102152758_article.sql b/assets/migrations/postgresql/20250102152758_article.sql
new file mode 100644
index 0000000..b85e21e
--- /dev/null
+++ b/assets/migrations/postgresql/20250102152758_article.sql
@@ -0,0 +1,17 @@
+-- +goose Up
+-- +goose StatementBegin
+CREATE TABLE articles (
+	id SERIAL PRIMARY KEY,
+	title VARCHAR(255) NOT NULL,
+	sourceUrl VARCHAR(255) NOT NULL UNIQUE,
+	author VARCHAR(255) NOT NULL,
+	content TEXT NOT NULL,
+	publishDate TIMESTAMP NOT NULL,
+	fetchDate TIMESTAMP NOT NULL
+);
+-- +goose StatementEnd
+
+-- +goose Down
+-- +goose StatementBegin
+DROP TABLE IF EXISTS articles;
+-- +goose StatementEnd
diff --git a/assets/migrations/postgresql/20250102232127_article_fts.sql b/assets/migrations/postgresql/20250102232127_article_fts.sql
new file mode 100644
index 0000000..dfecf09
--- /dev/null
+++ b/assets/migrations/postgresql/20250102232127_article_fts.sql
@@ -0,0 +1,15 @@
+-- +goose Up
+-- +goose StatementBegin
+ALTER TABLE articles
+ADD COLUMN fts_vector tsvector GENERATED ALWAYS AS (
+	to_tsvector('german', coalesce(title, '') || ' ' || coalesce(content, '') || ' ' || coalesce(author, ''))
+) STORED;
+
+CREATE INDEX articles_fts_idx ON articles USING gin(fts_vector);
+-- +goose StatementEnd
+
+-- +goose Down
+-- +goose StatementBegin
+DROP INDEX IF EXISTS articles_fts_idx;
+ALTER TABLE articles DROP COLUMN IF EXISTS fts_vector;
+-- +goose StatementEnd
diff --git a/assets/migrations/postgresql/20250103140557_response_cache.sql b/assets/migrations/postgresql/20250103140557_response_cache.sql
new file mode 100644
index 0000000..7d3ef04
--- /dev/null
+++ b/assets/migrations/postgresql/20250103140557_response_cache.sql
@@ -0,0 +1,13 @@
+-- +goose Up
+-- +goose StatementBegin
+CREATE TABLE responses (
+	url VARCHAR(255) NOT NULL UNIQUE PRIMARY KEY,
+	content BYTEA NOT NULL,
+	fetchDate TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+-- +goose StatementEnd
+
+-- +goose Down
+-- +goose StatementBegin
+DROP TABLE IF EXISTS responses;
+-- +goose StatementEnd
diff --git a/assets/migrations/postgresql/20250103203320_response_new_column.sql b/assets/migrations/postgresql/20250103203320_response_new_column.sql
new file mode 100644
index 0000000..3c95161
--- /dev/null
+++ b/assets/migrations/postgresql/20250103203320_response_new_column.sql
@@ -0,0 +1,9 @@
+-- +goose Up
+-- +goose StatementBegin
+ALTER TABLE responses ADD COLUMN processed BOOLEAN DEFAULT false;
+-- +goose StatementEnd
+
+-- +goose Down
+-- +goose StatementBegin
+ALTER TABLE responses DROP COLUMN IF EXISTS processed;
+-- +goose StatementEnd
diff --git a/assets/migrations/20250102152758_article.sql b/assets/migrations/sqlite3/20250102152758_article.sql
similarity index 100%
rename from assets/migrations/20250102152758_article.sql
rename to assets/migrations/sqlite3/20250102152758_article.sql
diff --git a/assets/migrations/20250102232127_article_fts.sql b/assets/migrations/sqlite3/20250102232127_article_fts.sql
similarity index 100%
rename from assets/migrations/20250102232127_article_fts.sql
rename to assets/migrations/sqlite3/20250102232127_article_fts.sql
diff --git a/assets/migrations/20250103140557_response_cache.sql b/assets/migrations/sqlite3/20250103140557_response_cache.sql
similarity index 100%
rename from assets/migrations/20250103140557_response_cache.sql
rename to assets/migrations/sqlite3/20250103140557_response_cache.sql
diff --git a/assets/migrations/20250103203320_response_new_column.sql b/assets/migrations/sqlite3/20250103203320_response_new_column.sql
similarity index 100%
rename from assets/migrations/20250103203320_response_new_column.sql
rename to assets/migrations/sqlite3/20250103203320_response_new_column.sql
diff --git a/cmd/crawler/collectors/spiegel.go b/cmd/crawler/collectors/spiegel.go
index 9eccdfb..78ae73f 100644
--- a/cmd/crawler/collectors/spiegel.go
+++ b/cmd/crawler/collectors/spiegel.go
@@ -1,110 +1,59 @@
 package collectors

 import (
-	//"crowsnest/internal/model"
-	//"regexp"
-	//"time"
-	//"strings"
-
+	"crowsnest/internal/model"
 	"fmt"
+	"log"
+	"strings"
 	"time"

 	"github.com/gocolly/colly/v2"
 )

-
 func (c *Collector) Spiegel() {
 	collycollector := colly.NewCollector(
 		colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
 		colly.CacheDir("./persistence/spiegel_cache"),
-		colly.MaxDepth(3),
+		colly.MaxDepth(2),
 	)

+	// cache
+	collycollector.OnRequest(func(r *colly.Request) {
+		url := r.URL.String()
+		exists, err := c.Responses.UrlExists(url)
+		if err == nil && !exists {
+			c.Responses.Insert(url, nil)
+			log.Println("request", url)
+		} else {
+			r.Abort()
+		}
+	})
+	collycollector.OnResponse(func(r *colly.Response) {
+		url := r.Request.URL.String()
+		c.Responses.Update(&model.Response{Url: url, Content: r.Body, FetchDate: time.Now(), Processed: false})
+		log.Println("response cached", url)
+	})
+
+	// cascade
 	collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
-		e.Request.Visit(e.Attr("href"))
+		url := e.Attr("href")
+		log.Println("found", url)
+
+		if !strings.HasPrefix(url, "http") {
+			return
+		}
+		log.Println("visiting", url)
+		e.Request.Visit(url)
 	})

-	// cache
-	collycollector.OnScraped(func(r *colly.Response) {
-		c.Responses.Insert(r.Request.URL.String(), string(r.Body))
-	})
-
-	// go through archive
-	startDate := time.Date(2000, time.January, 1, 0, 0, 0, 0, time.UTC)
+	// go through archive
+	startDate := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC)
 	currentDate := time.Now()

 	for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 1) {
 		urlDate := date.Format("02.01.2006")
 		url := fmt.Sprintf("https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html", urlDate)
-
-		collycollector.Visit(url)
+
+		collycollector.Visit(url)
 	}
-
-	//// create entry if not behind paywall
-	//paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
-	//collycollector.OnResponse(func(r *colly.Response) {
-	//	if paywall_false_pattern.Match(r.Body) {
-	//		url := r.Request.URL.String()
-	//		(*results)[url] = &model.Article{
-	//			SourceUrl: url,
-	//			FetchDate: time.Now(),
-	//			Content:   "",
-	//		}
-	//	}
-
-	//})
-
-	//// check for article type
-	//collycollector.OnHTML("meta[property='og:type']", func(e *colly.HTMLElement) {
-	//	if e.Attr("content") != "article" {
-	//		delete(*results, e.Request.URL.String())
-	//	}
-	//})
-
-	//// add title
-	//collycollector.OnHTML("meta[property='og:title']", func(e *colly.HTMLElement) {
-	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
-	//		val.Title = e.Attr("content")
-	//	}
-	//})
-
-	//// prepend description to content of article
-	//collycollector.OnHTML("meta[name='description']", func(e *colly.HTMLElement) {
-	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
-	//		val.Content = e.Attr("content") + val.Content
-	//	}
-	//})
-
-	//// add publishing date
-	//collycollector.OnHTML("meta[name='date']", func(e *colly.HTMLElement) {
-	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
-	//		t, err := time.Parse("2006-01-02T15:04:05-07:00", e.Attr("content"))
-	//		if err != nil {
-	//			panic(err)
-	//		}
-	//		val.PublishDate = t
-	//	}
-	//})
-
-	//// add author
-	//collycollector.OnHTML("meta[name='author']", func(e *colly.HTMLElement) {
-	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
-	//		val.Author = e.Attr("content")
-	//	}
-	//})
-
-	//// add content
-	//collycollector.OnHTML("main[id='Inhalt'] div > p", func(e *colly.HTMLElement) {
-	//	if val, ok := (*results)[e.Request.URL.String()]; ok {
-	//		cont := val.Content
-
-	//		pattern := regexp.MustCompile("\\s+")
-	//		cont = string(pattern.ReplaceAll([]byte(cont), []byte(" ")))
-	//		cont = strings.ReplaceAll(cont, "»", "\"")
-	//		cont = strings.ReplaceAll(cont, "«", "\"")
-	//		val.Content = cont + " " + e.Text
-	//	}
-	//})
-
 }
diff --git a/cmd/crawler/collectors/zeit.go b/cmd/crawler/collectors/zeit.go
index 7b7bffb..621d9dd 100644
--- a/cmd/crawler/collectors/zeit.go
+++ b/cmd/crawler/collectors/zeit.go
@@ -23,7 +23,7 @@ func (c *Collector) Zeit() {

 	// cache
 	collycollector.OnScraped(func(r *colly.Response) {
-		c.Responses.Insert(r.Request.URL.String(), string(r.Body))
+		c.Responses.Insert(r.Request.URL.String(), r.Body)
 	})

 	// go through archive
diff --git a/cmd/crawler/extractors/spiegel.go b/cmd/crawler/extractors/spiegel.go
index 5c2bc4f..9dcb21f 100644
--- a/cmd/crawler/extractors/spiegel.go
+++ b/cmd/crawler/extractors/spiegel.go
@@ -10,104 +10,125 @@ import (
 	"github.com/PuerkitoBio/goquery"
 )

-
 func (extractor *Extractor) Spiegel() error {
-	// get urls to process
-	urls, err := extractor.Responses.UnprocessedUrls()
-	if err != nil { return err }
+	// get urls to process
+	urls, err := extractor.Responses.UnprocessedUrls()
+	if err != nil {
+		return err
+	}

 	paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
-	url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*")
-	whitespace := regexp.MustCompile("\\s+")
+	url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*")
+	whitespace := regexp.MustCompile("\\s+")

-	var exists bool
-	var pagetype, title, content, datestr, author string
-	var tag *goquery.Selection
-	var date time.Time
+	var exists bool
+	var pagetype, title, content, datestr, author string
+	var tag *goquery.Selection
+	var date time.Time

-	for _, url := range urls {
-		// check url url pattern
-		if !url_pattern.Match([]byte(url)) { continue }
+	for _, url := range urls {
+		// check url pattern
+		if !url_pattern.Match([]byte(url)) {
+			continue
+		}

-		// get response
-		res, err := extractor.Responses.GetByUrl(url)
-		if err != nil {
-			log.Println("failed to process url", url, "with", err)
-			continue
-		}
-
-		// check for paywall
-		if !paywall_false_pattern.Match([]byte(res.Content)) {
-			extractor.Responses.Processed(url)
-			continue
-		}
-
-		// construct goquery doc
-		doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Content))
-		if err != nil {
-			log.Println("failed to process url", url, "with", err)
-			continue
-		}
-
-		// check for article type
-		tag = doc.Find("meta[property='og:type']")
-		pagetype, exists = tag.Attr("content")
-		if !exists || pagetype != "article" { extractor.Responses.Processed(url); continue; }
+		// get response
+		res, err := extractor.Responses.GetByUrl(url)
+		if err != nil {
+			log.Println("failed to process url", url, "with", err)
+			continue
+		}

-		// get title
-		tag = doc.Find("meta[property='og:title']")
-		title, exists = tag.Attr("content")
-		if !exists { extractor.Responses.Processed(url); continue; }
+		// check for paywall
+		if !paywall_false_pattern.Match([]byte(res.Content)) {
+			extractor.Responses.Processed(url)
+			continue
+		}

-		// prepend description to content of article
-		tag = doc.Find("meta[name='description']")
-		content, exists = tag.Attr("content")
-		content += " "
-		if !exists { extractor.Responses.Processed(url); continue; }
+		// construct goquery doc
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(res.Content)))
+		if err != nil {
+			log.Println("failed to process url", url, "with", err)
+			continue
+		}

-		// get publishing date
-		tag = doc.Find("meta[name='date']")
-		datestr, exists = tag.Attr("content")
-		if !exists { extractor.Responses.Processed(url); continue; }
+		// check for article type
+		tag = doc.Find("meta[property='og:type']")
+		pagetype, exists = tag.Attr("content")
+		if !exists || pagetype != "article" {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// get title
+		tag = doc.Find("meta[property='og:title']")
+		title, exists = tag.Attr("content")
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// prepend description to content of article
+		tag = doc.Find("meta[name='description']")
+		content, exists = tag.Attr("content")
+		content += " "
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}
+
+		// get publishing date
+		tag = doc.Find("meta[name='date']")
+		datestr, exists = tag.Attr("content")
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}
 		date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr)
-		if err != nil { extractor.Responses.Processed(url); continue; }
+		if err != nil {
+			extractor.Responses.Processed(url)
+			continue
+		}

-		// get author
-		tag = doc.Find("meta[name='author']")
-		author, exists = tag.Attr("content")
-		if !exists { extractor.Responses.Processed(url); continue; }
+		// get author
+		tag = doc.Find("meta[name='author']")
+		author, exists = tag.Attr("content")
+		if !exists {
+			extractor.Responses.Processed(url)
+			continue
+		}

-		// get content
-		tag = doc.Find("main[id='Inhalt'] div > p")
+		// get content
+		tag = doc.Find("main[id='Inhalt'] div > p")

-		tag.Each(func(index int, p *goquery.Selection) {
-			content += " " + p.Text()
-		})
-
-		// clean up content string
-		content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
-		content = strings.ReplaceAll(content, "»", "\"")
-		content = strings.ReplaceAll(content, "«", "\"")
-
-		// insert new article
-		article := model.Article{
-			SourceUrl:   url,
-			PublishDate: date,
-			FetchDate:   res.FetchDate,
-			Title:       title,
-			Content:     content,
-			Author:      author,
-		}
-
-		err = extractor.Articles.Insert(&article)
-		if err != nil {
-			log.Println("failed to insert", article)
-		} else {
-			extractor.Responses.Processed(url)
-			log.Println("found article at", url)
-		}
-	}
+		tag.Each(func(index int, p *goquery.Selection) {
+			content += " " + p.Text()
+		})

-	return nil
+		// clean up content string
+		content = string(whitespace.ReplaceAll([]byte(content), []byte(" ")))
+		content = strings.ReplaceAll(content, "»", "\"")
+		content = strings.ReplaceAll(content, "«", "\"")
+
+		// insert new article
+		article := model.Article{
+			SourceUrl:   url,
+			PublishDate: date,
+			FetchDate:   res.FetchDate,
+			Title:       title,
+			Content:     content,
+			Author:      author,
+		}
+
+		err = extractor.Articles.Insert(&article)
+		if err != nil {
+			log.Println("failed to insert", article)
+		} else {
+			extractor.Responses.Processed(url)
+			log.Println("found article at", url)
+		}
+	}
+
+	return nil
 }
diff --git a/cmd/crawler/main.go b/cmd/crawler/main.go
index 5fd1543..0acc7de 100644
--- a/cmd/crawler/main.go
+++ b/cmd/crawler/main.go
@@ -1,26 +1,31 @@
 package main

 import (
-	"crowsnest/cmd/crawler/collectors"
 	"crowsnest/cmd/crawler/extractors"
 	"crowsnest/internal/model/database"
 	"database/sql"
 	"log"
+	"os"
+	"time"

-	_ "github.com/mattn/go-sqlite3"
+	_ "github.com/lib/pq"
 )

 func main() {
-	// open database
-	db, err := sql.Open("sqlite3", "./persistence/app.db")
+	// collect environment variables
+	databaseURL := os.Getenv("DB_URL")
+
+	// connect to database
+	db, err := sql.Open("postgres", databaseURL)
 	if err != nil {
 		log.Fatal(err)
 	}
+	defer db.Close()

 	// collect websites
-	_ = collectors.Collector{
-		Responses: &database.ResponseModel{DB: db},
-	}
+	//coll := collectors.Collector{
+	//	Responses: &database.ResponseModel{DB: db},
+	//}

 	//coll.Spiegel()
 	//coll.Zeit()
@@ -31,5 +36,8 @@ func main() {
 		Articles: &database.ArticleModel{DB: db},
 	}

-	extr.Spiegel()
+	for {
+		extr.Spiegel()
+		time.Sleep(5 * time.Second)
+	}
 }
diff --git a/cmd/frontend/UpSearch.go b/cmd/frontend/UpSearch.go
index bf00888..432df7a 100644
--- a/cmd/frontend/UpSearch.go
+++ b/cmd/frontend/UpSearch.go
@@ -9,29 +9,32 @@ import (

 // Enpoint that returns a list of articles given search terms in the post
 // request of a search form. Uses the content template.
 func (app *App) UpSearch(w http.ResponseWriter, req *http.Request) {
-	// construct search query
-	searchTerms := req.FormValue("search")
-	if searchTerms == "" {
-		app.Index(w, req)
-		return
-	}
-
-	// get articles
-	articles, err := app.articles.Search(searchTerms)
-	if err != nil {
-		// treat as no result
-		//http.Error(w, err.Error(), http.StatusInternalServerError)
-		return
-	}
+	// construct search query
+	searchTerms := req.FormValue("search")
+	if searchTerms == "" {
+		app.Index(w, req)
+		return
+	}

-	// convert to viewmodel
-	articleVMs := make([]*model.ArticleViewModel, 0, len(articles))
-	for _, a := range articles {
-		articleVMs = append(articleVMs, a.ViewModel())
-	}
-
-	// render template
-	t := template.Must(template.ParseFiles("assets/templates/article.html"))
-	err = t.ExecuteTemplate(w, "content", articleVMs)
-	if err != nil { http.Error(w, "Failed to render template", http.StatusInternalServerError); return; }
+	// get articles
+	articles, err := app.articles.Search(searchTerms)
+	if err != nil {
+		// report the error instead of treating it as an empty result
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	// convert to viewmodel
+	articleVMs := make([]*model.ArticleViewModel, 0, len(articles))
+	for _, a := range articles {
+		articleVMs = append(articleVMs, a.ViewModel())
+	}
+
+	// render template
+	t := template.Must(template.ParseFiles("assets/templates/article.html"))
+	err = t.ExecuteTemplate(w, "content", articleVMs)
+	if err != nil {
+		http.Error(w, "Failed to render template", http.StatusInternalServerError)
+		return
+	}
 }
diff --git a/cmd/frontend/main.go b/cmd/frontend/main.go
index d2ee89c..fd8211c 100644
--- a/cmd/frontend/main.go
+++ b/cmd/frontend/main.go
@@ -3,13 +3,11 @@ package main
 import (
 	"crowsnest/internal/model/database"
 	"database/sql"
-	"errors"
 	"log"
 	"net/http"
 	"os"

 	_ "github.com/lib/pq"
-	_ "github.com/mattn/go-sqlite3"
 )

 type App struct {
@@ -19,24 +17,17 @@ type App struct {
 func main() {
 	// collect environement variables
 	databaseURL := os.Getenv("DB_URL")
-	dbDriver := os.Getenv("DB_DRIVER")

 	// connect to database
-	var db *sql.DB
-	var err error
-	switch {
-	case dbDriver == "sqlite3":
-		db, err = sql.Open("sqlite3", databaseURL)
-		if err != nil {
-			log.Fatal(err)
-		}
-	default:
-		log.Fatal(errors.New("given DB_DRIVER is not supported"))
+	db, err := sql.Open("postgres", databaseURL)
+	if err != nil {
+		log.Fatal(err)
 	}
+	defer db.Close()

 	// define app
 	app := &App{
-		articles: &database.ArticleModel{DB: db, DbDriver: dbDriver},
+		articles: &database.ArticleModel{DB: db},
 	}

 	// start web server
diff --git a/go.mod b/go.mod
index ad876c0..82cd02b 100644
--- a/go.mod
+++ b/go.mod
@@ -13,6 +13,7 @@ require (
 	github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
 	github.com/golang/protobuf v1.4.2 // indirect
 	github.com/kennygrant/sanitize v1.2.4 // indirect
+	github.com/lib/pq v1.10.9 // indirect
 	github.com/mattn/go-sqlite3 v1.14.24 // indirect
 	github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
 	github.com/temoto/robotstxt v1.1.1 // indirect
diff --git a/go.sum b/go.sum
index 024088f..802ac7b 100644
--- a/go.sum
+++ b/go.sum
@@ -51,6 +51,8 @@ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN
 github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
 github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
 github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
+github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
+github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
 github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM=
 github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
diff --git a/internal/model/database/articles.go b/internal/model/database/articles.go
index 97a2d2d..3f2555e 100644
--- a/internal/model/database/articles.go
+++ b/internal/model/database/articles.go
@@ -5,13 +5,12 @@ import (
 	"database/sql"
 )

-// TODO docstring
 type ArticleModel struct {
-	DB       *sql.DB
-	DbDriver string
+	DB *sql.DB
 }

-// TODO docstring
+// Gets all the article objects from the database. This may return an error if
+// the connection to the database fails.
 func (m *ArticleModel) All() ([]model.Article, error) {
 	stmt := `
 	SELECT id, title, sourceUrl, author, content, publishDate, fetchDate
@@ -41,16 +40,18 @@ func (m *ArticleModel) All() ([]model.Article, error) {
 	return articles, nil
 }

-// TODO docstring
+// Uses the full-text search features of the underlying database to search
+// articles for a given search query. This may fail if the connection to the
+// database fails.
 func (m *ArticleModel) Search(query string) ([]model.Article, error) {
 	stmt := `
-	SELECT id, title, sourceUrl, author, content, publishDate, fetchDate
-	FROM articles JOIN (
-		SELECT id as id2, rank FROM fts_articles WHERE content MATCH ?
-	) ON id = id2
-	ORDER BY rank ASC, publishDate DESC
-	LIMIT 10
-	`
+	SELECT id, title, sourceUrl, author, content, publishDate, fetchDate
+	FROM articles
+	WHERE fts_vector @@ to_tsquery('german', $1)
+	ORDER BY ts_rank(fts_vector, to_tsquery('german', $1)) DESC
+	LIMIT 10
+	`
+
 	rows, err := m.DB.Query(stmt, query)
 	if err != nil {
 		return nil, err
@@ -70,7 +71,6 @@ func (m *ArticleModel) Search(query string) ([]model.Article, error) {
 	if err = rows.Err(); err != nil {
 		return nil, err
 	}
-
 	return articles, nil
 }

@@ -78,55 +78,20 @@ func (m *ArticleModel) Search(query string) ([]model.Article, error) {
 // article will be ignored. May throw an error if the execution of the database
 // query fails.
 func (m *ArticleModel) Insert(a *model.Article) error {
-	// begin transaction
-	_, err := m.DB.Begin()
-	if err != nil {
-		return err
-	}
-
 	// insert article
 	stmt := `INSERT INTO articles (title, sourceUrl, author, content, publishDate, fetchDate)
-	VALUES (?, ?, ?, ?, ?, ?)
+	VALUES ($1, $2, $3, $4, $5, $6)
 	`
-	result, err := m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Author, a.Content, a.PublishDate, a.FetchDate)
-	if err != nil {
-		return err
-	}
-	lastId, err := result.LastInsertId()
-	if err != nil {
-		return err
-	}
-
-	// insert into fts_articles for full-text search
-	stmt = `INSERT INTO fts_articles (id, content)
-	VALUES (?, ? || '\n' || ? || '\n' || ?)
-	`
-	_, err = m.DB.Exec(stmt, lastId, a.Title, a.Author, a.Content)
+	_, err := m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Author, a.Content, a.PublishDate, a.FetchDate)

 	return err
 }

 // TODO docstring
 func (m *ArticleModel) Update(a *model.Article) error {
-	// begin transaction
-	_, err := m.DB.Begin()
-	if err != nil {
-		return err
-	}
-
-	// insert article
 	stmt := `UPDATE articles
-	SET title = ?, sourceUrl = ?, author = ?, content = ?, publishDate = ?, fetchDate = ?
-	WHERE id = ?
+	SET title = $1, sourceUrl = $2, author = $3, content = $4, publishDate = $5, fetchDate = $6
+	WHERE id = $7
 	`
-	_, err = m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Author, a.Content, a.PublishDate, a.FetchDate, a.Identifier)
-	if err != nil {
-		return err
-	}
-
-	// insert into fts_articles for full-text search
-	stmt = `INSERT INTO fts_articles (id, content)
-	VALUES (?, ? || '\n' || ? || '\n' || ?)
-	`
-	_, err = m.DB.Exec(stmt, a.Identifier, a.Title, a.Author, a.Content)
+	_, err := m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Author, a.Content, a.PublishDate, a.FetchDate, a.Identifier)

 	return err
 }
diff --git a/internal/model/database/responeses.go b/internal/model/database/responeses.go
index 1aa70ce..b024918 100644
--- a/internal/model/database/responeses.go
+++ b/internal/model/database/responeses.go
@@ -6,11 +6,11 @@ import (
 )

 type ResponseModel struct {
-	DB       *sql.DB
-	DbDriver string
+	DB *sql.DB
 }

-// TODO docstring
+// Gets all the response objects from the database. May return an error if the
+// connection to the database fails.
 func (m *ResponseModel) All() ([]model.Response, error) {
 	stmt := `
 	SELECT url, content, fetchDate, processed
@@ -39,7 +39,8 @@ func (m *ResponseModel) All() ([]model.Response, error) {
 	return responses, nil
 }

-// TODO docstring
+// Gets all those Response objects where the processed column is set to false.
+// May return an error if the connection to the database fails.
 func (m *ResponseModel) UnprocessedUrls() ([]string, error) {
 	stmt := `
 	SELECT url
@@ -69,12 +70,28 @@ func (m *ResponseModel) UnprocessedUrls() ([]string, error) {
 	return urls, nil
 }

-// TODO docstring
+// Checks if a given url exists in the database as a response. This may return
+// an error if the connection to the database fails.
+func (m *ResponseModel) UrlExists(url string) (bool, error) {
+	stmt := `
+	SELECT count(url) > 0 FROM responses WHERE url = $1
+	`
+
+	var result bool
+	row := m.DB.QueryRow(stmt, url)
+	err := row.Scan(&result)
+
+	return result, err
+}
+
+// Gets a certain Response object by its unique url. This may return an error
+// if no response with the given url exists or if the connection to the
+// database fails.
 func (m *ResponseModel) GetByUrl(url string) (model.Response, error) {
 	stmt := `
 	SELECT url, content, fetchDate, processed
 	FROM responses
-	WHERE url = ?
+	WHERE url = $1
 	`

 	res := model.Response{}
@@ -84,19 +101,33 @@ func (m *ResponseModel) GetByUrl(url string) (model.Response, error) {
 	return res, err
 }

-// TODO docstring
-func (m *ResponseModel) Insert(url string, content string) error {
+// Inserts a new response object into the database given the url and response
+// body. This may fail on the unique constraint of the url or if the connection
+// to the database fails.
+func (m *ResponseModel) Insert(url string, content []byte) error {
 	// insert response
-	stmt := `INSERT INTO responses (url, content) VALUES (?, ?)`
+	stmt := `INSERT INTO responses (url, content) VALUES ($1, $2)`
 	_, err := m.DB.Exec(stmt, url, content)

 	return err
 }

-// TODO docstring
+// Updates a response object in the database, using the url of the given
+// response object as the key. This may return an error if the connection to
+// the database fails.
+func (m *ResponseModel) Update(res *model.Response) error {
+	// update response
+	stmt := `UPDATE responses SET content = $1, fetchDate = $2, processed = $3 WHERE url = $4`
+	_, err := m.DB.Exec(stmt, res.Content, res.FetchDate, res.Processed, res.Url)
+
+	return err
+}
+
+// Sets the processed column of a certain response entry in the database, given
+// a url, to true. This may fail if the connection to the database fails.
 func (m *ResponseModel) Processed(url string) error {
 	// insert response
-	stmt := `UPDATE responses SET processed = true WHERE url = ?`
+	stmt := `UPDATE responses SET processed = true WHERE url = $1`
 	_, err := m.DB.Exec(stmt, url)

 	return err
diff --git a/internal/model/response.go b/internal/model/response.go
index 4db9b6d..a4255b9 100644
--- a/internal/model/response.go
+++ b/internal/model/response.go
@@ -5,7 +5,7 @@ import "time"
 // A simple cache for requests.
 type Response struct {
 	Url       string
-	Content   string
-	Processed bool
+	Content   []byte
+	Processed bool
 	FetchDate time.Time
 }
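The new files under assets/migrations/postgresql carry goose annotations (`-- +goose Up` / `-- +goose StatementBegin`), but nothing in the diff applies them. A minimal sketch of how they could be run, assuming the goose CLI is installed; the `migrate` target is hypothetical and reuses the connection string already present in the Makefile:

```make
migrate:
	goose -dir assets/migrations/postgresql postgres "user=crow password=4LlKpnQ2RZPzL13BSpkW4k dbname=crowsnest host=192.168.0.2 port=5432 sslmode=disable" up
```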
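A caveat on the rewritten `ArticleModel.Search`: `to_tsquery` expects tsquery operator syntax (`berlin & wahl`), so plain multi-word input such as `berlin wahl` makes the query fail with a syntax error, which `UpSearch` now surfaces as a 500. A sketch of a more forgiving variant under the same schema; the `SearchPlain` name is hypothetical, while `plainto_tsquery` is a standard PostgreSQL function that parses free-form text and ANDs the resulting lexemes:

```go
package database

import "crowsnest/internal/model"

// SearchPlain is a hypothetical alternative to Search that tolerates raw user
// input: plainto_tsquery never raises a syntax error, unlike to_tsquery.
func (m *ArticleModel) SearchPlain(query string) ([]model.Article, error) {
	stmt := `
	SELECT id, title, sourceUrl, author, content, publishDate, fetchDate
	FROM articles
	WHERE fts_vector @@ plainto_tsquery('german', $1)
	ORDER BY ts_rank(fts_vector, plainto_tsquery('german', $1)) DESC
	LIMIT 10
	`

	rows, err := m.DB.Query(stmt, query)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	articles := []model.Article{}
	for rows.Next() {
		a := model.Article{}
		// scan in the same order as the SELECT column list above
		if err := rows.Scan(&a.Identifier, &a.Title, &a.SourceUrl, &a.Author, &a.Content, &a.PublishDate, &a.FetchDate); err != nil {
			return nil, err
		}
		articles = append(articles, a)
	}

	return articles, rows.Err()
}
```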
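One subtlety both `main.go` files inherit from database/sql: `sql.Open` only validates its arguments and does not establish a connection, so a wrong DB_URL surfaces at the first query rather than at startup. A fail-fast sketch using the standard `Ping` method:

```go
package main

import (
	"database/sql"
	"log"
	"os"

	_ "github.com/lib/pq"
)

func main() {
	db, err := sql.Open("postgres", os.Getenv("DB_URL"))
	if err != nil {
		log.Fatal(err) // only catches a malformed DSN, not a failed connection
	}
	if err := db.Ping(); err != nil {
		log.Fatal(err) // fail fast on an unreachable or misconfigured database
	}
	defer db.Close()
}
```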