move from file storage to sqlite3

This commit is contained in:
2025-01-03 01:00:06 +01:00
parent fbca771479
commit 98655fd1fb
16 changed files with 150 additions and 389 deletions

View File

@@ -0,0 +1,17 @@
-- +goose Up
-- +goose StatementBegin
-- One row per scraped article.
CREATE TABLE articles (
    -- Surrogate key generated by SQLite on insert.
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title VARCHAR(255) NOT NULL,
    -- UNIQUE: a second insert with the same source URL is rejected.
    sourceUrl VARCHAR(255) NOT NULL UNIQUE,
    author VARCHAR(255) NOT NULL,
    content TEXT NOT NULL,
    publishDate DATETIME NOT NULL,
    -- presumably the time the scraper retrieved the article (verify against the caller)
    fetchDate DATETIME NOT NULL
);
-- +goose StatementEnd
-- +goose Down
-- +goose StatementBegin
DROP TABLE articles;
-- +goose StatementEnd

View File

@@ -0,0 +1,10 @@
-- +goose Up
-- +goose StatementBegin
-- Full-text search index over the articles table. Each row mirrors one
-- article: id holds the articles.id it belongs to, content holds title,
-- author and body joined by line feeds.
CREATE VIRTUAL TABLE fts_articles USING fts5(id, content);
-- Backfill the index from the existing articles. SQLite string literals do
-- not interpret backslash escapes, so a quoted backslash-n would be indexed
-- as a literal backslash followed by the letter n and would glue an "n" onto
-- the first word of author and content. char(10) produces a real line feed.
INSERT INTO fts_articles (id, content)
SELECT id, title || char(10) || author || char(10) || content FROM articles;
-- +goose StatementEnd
-- +goose Down
-- +goose StatementBegin
DROP TABLE fts_articles;
-- +goose StatementEnd

View File

@@ -2,13 +2,16 @@ package main
import ( import (
"crowsnest/internal/model" "crowsnest/internal/model"
"crowsnest/internal/data" "crowsnest/internal/model/sqlite"
"database/sql"
"fmt" "fmt"
"log"
"regexp" "regexp"
"strings" "strings"
"time" "time"
"github.com/gocolly/colly/v2" "github.com/gocolly/colly/v2"
_ "github.com/mattn/go-sqlite3"
) )
func spiegelCollector(results *map[string]*model.Article) *colly.Collector { func spiegelCollector(results *map[string]*model.Article) *colly.Collector {
@@ -21,7 +24,6 @@ func spiegelCollector(results *map[string]*model.Article) *colly.Collector {
// create entry if not behind paywall // create entry if not behind paywall
paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false") paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
c.OnResponse(func(r *colly.Response) { c.OnResponse(func(r *colly.Response) {
if paywall_false_pattern.Match(r.Body) { if paywall_false_pattern.Match(r.Body) {
url := r.Request.URL.String() url := r.Request.URL.String()
(*results)[url] = &model.Article{ (*results)[url] = &model.Article{
@@ -86,7 +88,6 @@ func spiegelCollector(results *map[string]*model.Article) *colly.Collector {
// cascade // cascade
c.OnHTML("a[href]", func(e *colly.HTMLElement) { c.OnHTML("a[href]", func(e *colly.HTMLElement) {
fmt.Println(e.Attr("href"))
e.Request.Visit(e.Attr("href")) e.Request.Visit(e.Attr("href"))
}) })
@@ -99,14 +100,15 @@ func main() {
c.Visit("https://www.spiegel.de/") c.Visit("https://www.spiegel.de/")
// data store db, err := sql.Open("sqlite3", "./persistence/app.db")
fds, _ := data.NewFileDatastore("./persistence/spiegel.json") if err != nil { log.Fatal(err) }
repo, _ := data.NewDefaultRepository[*model.Article](fds, "article")
db_articles := &sqlite.ArticleModel{ DB: db }
counter := 0 counter := 0
for _, val := range res { for _, val := range res {
counter++ counter++
repo.Create(val) db_articles.Insert(val)
} }
fmt.Println(counter) fmt.Println(counter)
} }

View File

@@ -1,27 +1,16 @@
package main package main
import ( import (
"crowsnest/internal/data"
"crowsnest/internal/model" "crowsnest/internal/model"
"html/template" "html/template"
"net/http" "net/http"
) )
// articleDateOrder is a sort criterion that weighs an article by its publish
// date.
type articleDateOrder struct{}

// Weight returns the article's publish date as Unix seconds, so a later
// publish date yields a larger weight.
func (articleDateOrder) Weight(a *model.Article) int {
	published := a.PublishDate
	return int(published.Unix())
}
// List the latest articles using the base template. // List the latest articles using the base template.
func (app *App) Index(w http.ResponseWriter, req *http.Request) { func (app *App) Index(w http.ResponseWriter, req *http.Request) {
// retrieve from repo // get articles
fds, err := data.NewFileDatastore("persistence/spiegel100.json") articles, err := app.articles.All()
if err != nil { http.Error(w, "Failed to load datastore", http.StatusInternalServerError); return; } if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError); return; }
repo, err := data.NewDefaultRepository[*model.Article](fds, "article")
if err != nil { http.Error(w, "Failed to create repository", http.StatusInternalServerError); return; }
articles, err := repo.GetByCriteria(articleDateOrder{})
if err != nil { http.Error(w, "Failed to get articles", http.StatusInternalServerError); return; }
// truncate // truncate
if len(articles) > 10 { if len(articles) > 10 {

View File

@@ -1,58 +1,23 @@
package main package main
import ( import (
"crowsnest/internal/data"
"crowsnest/internal/model" "crowsnest/internal/model"
"html/template" "html/template"
"net/http" "net/http"
"regexp"
"strings"
) )
// articleTermFrequency is a sort criterion that weighs an article by how
// often the given search terms occur in its content.
type articleTermFrequency struct {
	// terms are the search terms to count.
	terms []string
}

// Weight returns the summed number of non-overlapping occurrences of every
// term in the article's content.
func (ord articleTermFrequency) Weight(a *model.Article) int {
	total := 0
	for i := range ord.terms {
		total += strings.Count(a.Content, ord.terms[i])
	}
	return total
}
// Endpoint that returns a list of articles given search terms in the post // Endpoint that returns a list of articles given search terms in the post
// request of a search form. Uses the content template. // request of a search form. Uses the content template.
func (app *App) UpSearch(w http.ResponseWriter, req *http.Request) { func (app *App) UpSearch(w http.ResponseWriter, req *http.Request) {
// parse the form data searchTerms := req.FormValue("search")
err := req.ParseForm() if searchTerms == "" {
if err != nil { http.Error(w, "Unable to parse form", http.StatusBadRequest); return; }
// collect search terms
p := regexp.MustCompile("\\s+")
searchTerms := make([]string, 0)
for _, elem := range p.Split(req.FormValue("search"), -1) {
elem = strings.TrimSpace(elem)
if elem == "" { continue }
searchTerms = append(searchTerms, elem)
}
// retrieve from repo
fds, _ := data.NewFileDatastore("persistence/spiegel100.json")
if err != nil { http.Error(w, "Failed to read datastore", http.StatusInternalServerError); return; }
repo, _ := data.NewDefaultRepository[*model.Article](fds, "article")
if err != nil { http.Error(w, "Failed to create repository", http.StatusInternalServerError); return; }
var articles []*model.Article
if len(searchTerms) != 0 {
articles, err = repo.GetByCriteria(articleTermFrequency{ terms: searchTerms })
} else {
app.Index(w, req) app.Index(w, req)
return return
} }
if err != nil { http.Error(w, "Failed to get articles from repo", http.StatusInternalServerError); return; }
// get articles
articles, err := app.articles.Search(searchTerms)
if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError); return; }
// convert to viewmodel // convert to viewmodel
articleVMs := make([]*model.ArticleViewModel, 0, len(articles)) articleVMs := make([]*model.ArticleViewModel, 0, len(articles))

View File

@@ -1,21 +1,32 @@
package main package main
import ( import (
"crowsnest/internal/model/sqlite"
"database/sql"
"log" "log"
"net/http" "net/http"
_ "github.com/mattn/go-sqlite3"
) )
type App struct {} type App struct {
articles *sqlite.ArticleModel
}
func main() { func main() {
app := &App{} db, err := sql.Open("sqlite3", "./persistence/app.db")
if err != nil { log.Fatal(err) }
app := &App{
articles: &sqlite.ArticleModel{ DB: db },
}
server := http.Server{ server := http.Server{
Addr: ":8080", Addr: ":8080",
Handler: app.routes(), Handler: app.routes(),
} }
server.ListenAndServe()
log.Println("server started, listening on :8080") log.Println("server started, listening on :8080")
server.ListenAndServe()
} }

1
go.mod
View File

@@ -13,6 +13,7 @@ require (
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
github.com/golang/protobuf v1.4.2 // indirect github.com/golang/protobuf v1.4.2 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/mattn/go-sqlite3 v1.14.24 // indirect
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
github.com/temoto/robotstxt v1.1.1 // indirect github.com/temoto/robotstxt v1.1.1 // indirect
golang.org/x/net v0.0.0-20200602114024-627f9648deb9 // indirect golang.org/x/net v0.0.0-20200602114024-627f9648deb9 // indirect

2
go.sum
View File

@@ -46,6 +46,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg= github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM=
github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=

View File

@@ -1,134 +0,0 @@
package data
import (
"encoding/json"
"errors"
"slices"
"strings"
)
// DefaultRepository is the default implementation of IRepository. It stores
// values of type T in an IDatastore, JSON-encoded, under keys of the form
// prefix + ":" + id.
type DefaultRepository[T IIdentifiable] struct {
	// ds is the datastore all reads and writes go through.
	ds IDatastore
	// prefix is the key namespace of this repository; it is rejected by
	// NewDefaultRepository if it contains ':'.
	prefix string
}
// NewDefaultRepository builds a DefaultRepository for type T on top of the
// given datastore. Every key written through the repository starts with
// prefix followed by ':', so the prefix should not already be in use in ds.
// An error is returned when prefix itself contains a ':' character.
func NewDefaultRepository[T IIdentifiable](ds IDatastore, prefix string) (*DefaultRepository[T], error) {
	if strings.ContainsRune(prefix, ':') {
		return nil, errors.New("prefix should not contain ':'")
	}
	repo := &DefaultRepository[T]{prefix: prefix, ds: ds}
	return repo, nil
}
// Create stores t as a new entry in the repository. It returns an error if
// an entry with the same id already exists, if t cannot be encoded as JSON,
// or if the datastore reports an error.
func (repo *DefaultRepository[T]) Create(t T) error {
	key := repo.prefix + ":" + t.Id()

	if found, err := repo.ds.KeyExists(key); err != nil {
		return err
	} else if found {
		return errors.New("entry with given id already exists")
	}

	encoded, err := json.Marshal(t)
	if err != nil {
		return err
	}
	return repo.ds.Set(key, string(encoded))
}
// Update overwrites the entry that has the same id as t with the values of
// t. It returns an error if no such entry exists, if t cannot be encoded as
// JSON, or if the datastore reports an error.
func (repo *DefaultRepository[T]) Update(t T) error {
	key := repo.prefix + ":" + t.Id()

	if found, err := repo.ds.KeyExists(key); err != nil {
		return err
	} else if !found {
		return errors.New("no entry with given id")
	}

	encoded, err := json.Marshal(t)
	if err != nil {
		return err
	}
	return repo.ds.Set(key, string(encoded))
}
// Delete removes the entry that has the same id as t from the repository. It
// returns an error if no such entry exists or if the datastore reports an
// error.
func (repo *DefaultRepository[T]) Delete(t T) error {
	key := repo.prefix + ":" + t.Id()

	if found, err := repo.ds.KeyExists(key); err != nil {
		return err
	} else if !found {
		return errors.New("no entry with given id")
	}
	return repo.ds.Delete(key)
}
// GetAll returns every object of type T stored under this repository's
// prefix. The order of the result is unspecified because the keys come from
// a map. It returns an error if the datastore cannot be read or an entry
// cannot be decoded.
func (repo *DefaultRepository[T]) GetAll() ([]T, error) {
	allkeys, err := repo.ds.GetAllKeys()
	if err != nil {
		return nil, err
	}

	out := make([]T, 0)
	for key := range allkeys {
		// Split at the first ':' only. Keys without a separator (e.g. a key
		// that equals the bare prefix) are not repository entries and must
		// be skipped instead of indexing past the end of a split result.
		prefix, id, found := strings.Cut(key, ":")
		if !found || prefix != repo.prefix {
			continue
		}
		obj, err := repo.GetById(id)
		if err != nil {
			return nil, err
		}
		out = append(out, obj)
	}
	return out, nil
}
// GetById returns the object of type T stored under the given id. It returns
// an error if the datastore lookup fails or the stored value cannot be
// decoded from JSON.
func (repo *DefaultRepository[T]) GetById(id string) (T, error) {
	var obj T

	raw, err := repo.ds.Get(repo.prefix + ":" + id)
	if err != nil {
		return obj, err
	}
	if err := json.Unmarshal([]byte(raw), &obj); err != nil {
		return obj, err
	}
	return obj, nil
}
// GetByCriteria returns all elements of the repository whose
// ISortCriteria.Weight is greater than 0, sorted by that weight in
// descending order. The relative order of elements with equal weight is
// unspecified. It returns an error when the elements cannot be retrieved
// from the repository.
func (repo *DefaultRepository[T]) GetByCriteria(c ISortCriteria[T]) ([]T, error) {
	all, err := repo.GetAll()
	if err != nil {
		return nil, err
	}

	filtered := make([]T, 0)
	for _, elem := range all {
		if c.Weight(elem) > 0 {
			filtered = append(filtered, elem)
		}
	}

	slices.SortFunc(filtered, func(a, b T) int {
		wa, wb := c.Weight(a), c.Weight(b)
		switch {
		case wa > wb:
			return -1
		case wa < wb:
			return 1
		default:
			// Equal weights must compare as equal; reporting "greater" for
			// ties is not a consistent ordering for the sort.
			return 0
		}
	})
	return filtered, nil
}

View File

@@ -1,139 +0,0 @@
package data
import (
"os"
"maps"
"errors"
"encoding/json"
)
// FileDatastore is a very simple data structure implementing the IDatastore
// interface. It keeps all key/value pairs in a single text file, encoded as
// JSON.
type FileDatastore struct {
	// path is the location of the storage file.
	path string
	// data is the map most recently written by writeMapObj; once set,
	// readMapObj returns it instead of reading the file again.
	data map[string]string
}
// NewFileDatastore returns a FileDatastore backed by the file at path. If
// the file does not exist yet it is created with an empty store. Any other
// failure to read or decode the file is returned as an error; the file is
// left untouched so that existing data is never silently replaced by an
// empty store.
func NewFileDatastore(path string) (*FileDatastore, error) {
	fds := &FileDatastore{path: path}

	_, err := fds.readMapObj()
	switch {
	case err == nil:
		return fds, nil
	case errors.Is(err, os.ErrNotExist):
		// First use: initialise the storage file with an empty map.
		if err := fds.writeMapObj(make(map[string]string)); err != nil {
			return nil, err
		}
		return fds, nil
	default:
		// Unreadable or malformed file: report it rather than overwrite it.
		return nil, err
	}
}
// readMapObj returns the stored key/value pairs as a map. If a map has
// already been cached by writeMapObj it is returned directly; otherwise the
// storage file is read and decoded. It returns an error if the file does not
// exist, cannot be read, or its content cannot be decoded. The returned map
// is never nil on success.
func (fds *FileDatastore) readMapObj() (map[string]string, error) {
	if fds.data != nil {
		return fds.data, nil
	}

	dat, err := os.ReadFile(fds.path)
	if err != nil {
		return nil, err
	}

	var mapobj map[string]string
	if err := json.Unmarshal(dat, &mapobj); err != nil {
		return nil, err
	}
	if mapobj == nil {
		// A file containing JSON null decodes to a nil map without an
		// error; callers such as Set assign into the result, which would
		// panic on a nil map.
		mapobj = make(map[string]string)
	}
	return mapobj, nil
}
// writeMapObj encodes m as JSON into the storage file, replacing whatever
// the file contained before, and remembers m as the cached map. It returns
// an error if the file cannot be created or written to.
func (fds *FileDatastore) writeMapObj(m map[string]string) error {
	out, err := os.Create(fds.path)
	if err != nil {
		return err
	}
	defer out.Close()

	if err = json.NewEncoder(out).Encode(m); err != nil {
		return err
	}
	fds.data = m
	return nil
}
// --- implement IDatastore interface ---
// Set stores val under key, overwriting the value if the key already exists.
// It may return an error if the file cannot be opened, decoded or written.
func (fds *FileDatastore) Set(key string, val string) error {
	store, err := fds.readMapObj()
	if err != nil {
		return err
	}
	store[key] = val
	return fds.writeMapObj(store)
}
// KeyExists reports whether an entry exists for the given key. It may return
// an error if the file cannot be opened or its contents cannot be decoded.
func (fds *FileDatastore) KeyExists(key string) (bool, error) {
	entries, err := fds.readMapObj()
	if err != nil {
		return false, err
	}
	if _, present := entries[key]; present {
		return true, nil
	}
	return false, nil
}
// Get returns the value stored under key. It returns an error if the key
// does not exist, the file cannot be opened, or its contents cannot be
// decoded.
func (fds *FileDatastore) Get(key string) (string, error) {
	entries, err := fds.readMapObj()
	if err != nil {
		return "", err
	}
	if val, ok := entries[key]; ok {
		return val, nil
	}
	return "", errors.New("key not found")
}
// GetAll returns all key/value pairs of the datastore as a map. It may
// return an error if the file cannot be opened or its contents cannot be
// decoded. The result is whatever readMapObj returns, which can be the
// datastore's cached map rather than a copy.
func (fds *FileDatastore) GetAll() (map[string]string, error) {
	all, err := fds.readMapObj()
	return all, err
}
// GetAllKeys returns the set of all keys in the datastore as a map whose
// values are all true. It may return an error if the file cannot be opened
// or its contents cannot be decoded.
func (fds *FileDatastore) GetAllKeys() (map[string]bool, error) {
	entries, err := fds.readMapObj()
	if err != nil {
		return nil, err
	}

	keys := make(map[string]bool)
	for k := range maps.Keys(entries) {
		keys[k] = true
	}
	return keys, nil
}
// Delete removes the entry with the given key; a key that does not exist is
// not an error. It may return an error if the file cannot be opened or its
// contents cannot be decoded or encoded.
func (fds *FileDatastore) Delete(key string) error {
	store, err := fds.readMapObj()
	if err != nil {
		return err
	}
	delete(store, key)
	return fds.writeMapObj(store)
}

View File

@@ -1,12 +0,0 @@
package data
// IDatastore is the first layer of abstraction over a persistent string
// key/value store. The backing store may be a file or a database. Every
// method returns an error when the backing store cannot be accessed.
type IDatastore interface {
	// Set stores val under key.
	Set(key string, val string) error
	// KeyExists reports whether an entry for key exists.
	KeyExists(key string) (bool, error)
	// Get returns the value stored under key.
	Get(key string) (string, error)
	// GetAll returns all key/value pairs.
	GetAll() (map[string]string, error)
	// GetAllKeys returns all keys as the keys of a map.
	GetAllKeys() (map[string]bool, error)
	// Delete removes the entry stored under key.
	Delete(key string) error
}

View File

@@ -1,7 +0,0 @@
package data
// IIdentifiable is implemented by objects that can uniquely identify
// themselves. The id may be used as a primary key in a database or
// datastore.
type IIdentifiable interface {
	// Id returns the unique identifier of the object. The identifier is not
	// allowed to contain a ':'.
	Id() string
}

View File

@@ -1,13 +0,0 @@
package data
// IRepository manages objects of a generic type T persistently.
// Implementations should use an IDatastore as the layer that actually stores
// and retrieves the data from an external source.
type IRepository[T IIdentifiable] interface {
	// Create adds t as a new entry.
	Create(t T) error
	// Update replaces the stored entry that has the same id as t.
	Update(t T) error
	// Delete removes the stored entry that has the same id as t.
	Delete(t T) error
	// GetAll returns every stored object.
	GetAll() ([]T, error)
	// GetById returns the stored object with the given id.
	GetById(id string) (T, error)
	// GetByCriteria returns stored objects selected and ordered by the
	// weight that c assigns to them.
	GetByCriteria(c ISortCriteria[T]) ([]T, error)
}

View File

@@ -1,7 +0,0 @@
package data
// ISortCriteria defines a Weight function that determines an order on type
// T. As an example, this may be used to sort articles by date or to filter
// for a search term.
type ISortCriteria[T any] interface {
	// Weight returns the weight of t under this criterion.
	Weight(t T) int
}

View File

@@ -2,13 +2,12 @@ package model
import ( import (
"time" "time"
"crypto/sha256"
"encoding/hex"
) )
// TODO docstring // TODO docstring
type Article struct { type Article struct {
Identifier int
SourceUrl string SourceUrl string
PublishDate time.Time PublishDate time.Time
FetchDate time.Time FetchDate time.Time
@@ -42,13 +41,3 @@ func (a *Article) ViewModel() *ArticleViewModel {
Summary: summary, Summary: summary,
} }
} }
// --- implement IIdentifiable interface ---
// Id returns the hex-encoded SHA-256 digest of the article's source URL. It
// can be used to identify the article.
func (a *Article) Id() string {
	h := sha256.New()
	h.Write([]byte(a.SourceUrl)) // hash.Hash.Write never returns an error
	return hex.EncodeToString(h.Sum(nil))
}

View File

@@ -0,0 +1,87 @@
package sqlite
import (
"crowsnest/internal/model"
"database/sql"
)
// ArticleModel provides access to the articles stored in a SQL database. Its
// methods query the articles table and the fts_articles full-text index.
type ArticleModel struct {
	// DB is the database handle every query is executed on.
	DB *sql.DB
}
// All returns every article in the database, newest publish date first. The
// result is an empty, non-nil slice when there are no articles. It returns
// an error if the query fails or a row cannot be scanned.
func (m *ArticleModel) All() ([]model.Article, error) {
	stmt := `
	SELECT id, title, sourceUrl, author, content, publishDate, fetchDate
	FROM articles
	ORDER BY publishDate DESC
	`
	rows, err := m.DB.Query(stmt)
	if err != nil {
		return nil, err
	}
	// Release the result set (and its connection) on every path, including
	// the early return on a scan error.
	defer rows.Close()

	articles := []model.Article{}
	for rows.Next() {
		a := model.Article{}
		err := rows.Scan(&a.Identifier, &a.Title, &a.SourceUrl, &a.Author, &a.Content, &a.PublishDate, &a.FetchDate)
		if err != nil {
			return nil, err
		}
		articles = append(articles, a)
	}
	if err = rows.Err(); err != nil {
		return nil, err
	}
	return articles, nil
}
// Search returns at most 10 articles whose full-text index entry matches
// query, ordered by ascending rank and then by newest publish date. The
// result is an empty, non-nil slice when nothing matches. It returns an
// error if the query fails or a row cannot be scanned.
//
// NOTE(review): query is bound to the MATCH operator unchanged, so it is
// presumably interpreted as FTS5 query syntax and malformed user input will
// surface as a database error. Confirm whether callers should sanitise it.
func (m *ArticleModel) Search(query string) ([]model.Article, error) {
	stmt := `
	SELECT id, title, sourceUrl, author, content, publishDate, fetchDate
	FROM articles JOIN (
		SELECT id as id2, rank FROM fts_articles WHERE content MATCH ?
	) ON id = id2
	ORDER BY rank ASC, publishDate DESC
	LIMIT 10
	`
	rows, err := m.DB.Query(stmt, query)
	if err != nil {
		return nil, err
	}
	// Release the result set (and its connection) on every path, including
	// the early return on a scan error.
	defer rows.Close()

	articles := []model.Article{}
	for rows.Next() {
		a := model.Article{}
		err := rows.Scan(&a.Identifier, &a.Title, &a.SourceUrl, &a.Author, &a.Content, &a.PublishDate, &a.FetchDate)
		if err != nil {
			return nil, err
		}
		articles = append(articles, a)
	}
	if err = rows.Err(); err != nil {
		return nil, err
	}
	return articles, nil
}
// Insert adds a new article to the database and to the full-text search
// index. The id attribute of the given article is ignored. Both inserts run
// in a single transaction, so either the article and its index entry are
// stored together or neither is. It returns an error if the transaction
// cannot be started, a statement fails, or the commit fails.
func (m *ArticleModel) Insert(a *model.Article) error {
	tx, err := m.DB.Begin()
	if err != nil {
		return err
	}
	// Undo everything on any early return. After a successful Commit the
	// rollback does nothing useful and its error is deliberately ignored.
	defer tx.Rollback()

	// insert article
	stmt := `INSERT INTO articles (title, sourceUrl, author, content, publishDate, fetchDate)
	VALUES (?, ?, ?, ?, ?, ?)
	`
	result, err := tx.Exec(stmt, a.Title, a.SourceUrl, a.Author, a.Content, a.PublishDate, a.FetchDate)
	if err != nil {
		return err
	}
	lastId, err := result.LastInsertId()
	if err != nil {
		return err
	}

	// insert into fts_articles for full-text search; char(10) is a line
	// feed, because SQLite does not interpret backslash escapes inside
	// string literals
	stmt = `INSERT INTO fts_articles (id, content)
	VALUES (?, ? || char(10) || ? || char(10) || ?)
	`
	if _, err = tx.Exec(stmt, lastId, a.Title, a.Author, a.Content); err != nil {
		return err
	}
	return tx.Commit()
}