diff --git a/assets/migrations/20250102152758_article.sql b/assets/migrations/20250102152758_article.sql
new file mode 100644
index 0000000..4a907af
--- /dev/null
+++ b/assets/migrations/20250102152758_article.sql
@@ -0,0 +1,17 @@
+-- +goose Up
+-- +goose StatementBegin
+CREATE TABLE articles (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    title VARCHAR(255) NOT NULL,
+    sourceUrl VARCHAR(255) NOT NULL UNIQUE,
+    author VARCHAR(255) NOT NULL,
+    content TEXT NOT NULL,
+    publishDate DATETIME NOT NULL,
+    fetchDate DATETIME NOT NULL
+);
+-- +goose StatementEnd
+
+-- +goose Down
+-- +goose StatementBegin
+DROP TABLE articles;
+-- +goose StatementEnd
diff --git a/assets/migrations/20250102232127_article_fts.sql b/assets/migrations/20250102232127_article_fts.sql
new file mode 100644
index 0000000..28b51b7
--- /dev/null
+++ b/assets/migrations/20250102232127_article_fts.sql
@@ -0,0 +1,11 @@
+-- +goose Up
+-- +goose StatementBegin
+CREATE VIRTUAL TABLE fts_articles USING fts5(id, content);
+-- char(10) is a newline; SQLite does not treat '\n' in a string literal as an escape
+INSERT INTO fts_articles (id, content) SELECT id, title || char(10) || author || char(10) || content FROM articles;
+-- +goose StatementEnd
+
+-- +goose Down
+-- +goose StatementBegin
+DROP TABLE fts_articles;
+-- +goose StatementEnd
diff --git a/cmd/crawler/main.go b/cmd/crawler/main.go
index f79feeb..5c95458 100644
--- a/cmd/crawler/main.go
+++ b/cmd/crawler/main.go
@@ -2,13 +2,16 @@ package main
 
 import (
 	"crowsnest/internal/model"
-	"crowsnest/internal/data"
+	"crowsnest/internal/model/sqlite"
+	"database/sql"
 	"fmt"
+	"log"
 	"regexp"
 	"strings"
 	"time"
 
 	"github.com/gocolly/colly/v2"
+	_ "github.com/mattn/go-sqlite3"
 )
 
 func spiegelCollector(results *map[string]*model.Article) *colly.Collector {
@@ -21,7 +24,6 @@ func spiegelCollector(results *map[string]*model.Article) *colly.Collector {
 	// create entry if not behind paywall
 	paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false")
 	c.OnResponse(func(r *colly.Response) {
-
 		if paywall_false_pattern.Match(r.Body) {
 			url := r.Request.URL.String()
 			(*results)[url] = &model.Article{
@@ -86,7 +88,6 @@ func spiegelCollector(results *map[string]*model.Article) *colly.Collector {
 
 	// cascade
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
-		fmt.Println(e.Attr("href"))
 		e.Request.Visit(e.Attr("href"))
 	})
 
@@ -99,14 +100,16 @@ func main() {
 
 	c.Visit("https://www.spiegel.de/")
 
-	// data store
-	fds, _ := data.NewFileDatastore("./persistence/spiegel.json")
-	repo, _ := data.NewDefaultRepository[*model.Article](fds, "article")
+	db, err := sql.Open("sqlite3", "./persistence/app.db")
+	if err != nil { log.Fatal(err) }
+	defer db.Close()
+
+	db_articles := &sqlite.ArticleModel{ DB: db }
 
 	counter := 0
 	for _, val := range res {
 		counter++
-		repo.Create(val)
+		if err := db_articles.Insert(val); err != nil { log.Printf("inserting %s: %v", val.SourceUrl, err) }
 	}
 	fmt.Println(counter)
 }
diff --git a/cmd/frontend/Index.go b/cmd/frontend/Index.go
index f34ba18..ccf46ac 100644
--- a/cmd/frontend/Index.go
+++ b/cmd/frontend/Index.go
@@ -1,27 +1,16 @@
 package main
 
 import (
-	"crowsnest/internal/data"
 	"crowsnest/internal/model"
 	"html/template"
 	"net/http"
 )
 
-// sort criteria
-type articleDateOrder struct {}
-func (ord articleDateOrder) Weight(a *model.Article) int {
-	return int(a.PublishDate.Unix())
-}
-
 // List the latest articles using the base template.
 func (app *App) Index(w http.ResponseWriter, req *http.Request) {
-	// retrieve from repo
-	fds, err := data.NewFileDatastore("persistence/spiegel100.json")
-	if err != nil { http.Error(w, "Failed to load datastore", http.StatusInternalServerError); return; }
-	repo, err := data.NewDefaultRepository[*model.Article](fds, "article")
-	if err != nil { http.Error(w, "Failed to create repository", http.StatusInternalServerError); return; }
-	articles, err := repo.GetByCriteria(articleDateOrder{})
-	if err != nil { http.Error(w, "Failed to get articles", http.StatusInternalServerError); return; }
+	// get articles
+	articles, err := app.articles.All()
+	if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError); return; }
 
 	// truncate
 	if len(articles) > 10 {
diff --git a/cmd/frontend/UpSearch.go b/cmd/frontend/UpSearch.go
index 6d2d080..39f75bb 100644
--- a/cmd/frontend/UpSearch.go
+++ b/cmd/frontend/UpSearch.go
@@ -1,58 +1,23 @@
 package main
 
 import (
-	"crowsnest/internal/data"
 	"crowsnest/internal/model"
 	"html/template"
 	"net/http"
-	"regexp"
-	"strings"
 )
 
-// sort criteria
-type articleTermFrequency struct {
-	terms []string
-}
-
-func (ord articleTermFrequency) Weight(a *model.Article) int {
-	score := 0
-	for _, term := range ord.terms {
-		score += strings.Count(a.Content, term)
-	}
-	return score
-}
-
-// Enpoint that returns a list of articles given search terms in the post
+// Endpoint that returns a list of articles given search terms in the post
 // request of a search form. Uses the content template.
 func (app *App) UpSearch(w http.ResponseWriter, req *http.Request) {
-	// parse the form data
-	err := req.ParseForm()
-	if err != nil { http.Error(w, "Unable to parse form", http.StatusBadRequest); return; }
-
-	// collect search terms
-	p := regexp.MustCompile("\\s+")
-	searchTerms := make([]string, 0)
-	for _, elem := range p.Split(req.FormValue("search"), -1) {
-		elem = strings.TrimSpace(elem)
-		if elem == "" { continue }
-		searchTerms = append(searchTerms, elem)
-	}
-
-	// retrieve from repo
-	fds, _ := data.NewFileDatastore("persistence/spiegel100.json")
-	if err != nil { http.Error(w, "Failed to read datastore", http.StatusInternalServerError); return; }
-
-	repo, _ := data.NewDefaultRepository[*model.Article](fds, "article")
-	if err != nil { http.Error(w, "Failed to create repository", http.StatusInternalServerError); return; }
-
-	var articles []*model.Article
-	if len(searchTerms) != 0 {
-		articles, err = repo.GetByCriteria(articleTermFrequency{ terms: searchTerms })
-	} else {
+	searchTerms := req.FormValue("search")
+	if searchTerms == "" {
 		app.Index(w, req)
 		return
 	}
-	if err != nil { http.Error(w, "Failed to get articles from repo", http.StatusInternalServerError); return; }
+
+	// get articles
+	articles, err := app.articles.Search(searchTerms)
+	if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError); return; }
 
 	// convert to viewmodel
 	articleVMs := make([]*model.ArticleViewModel, 0, len(articles))
diff --git a/cmd/frontend/main.go b/cmd/frontend/main.go
index 789d33d..ce2f9c5 100644
--- a/cmd/frontend/main.go
+++ b/cmd/frontend/main.go
@@ -1,21 +1,32 @@
 package main
 
 import (
+	"crowsnest/internal/model/sqlite"
+	"database/sql"
 	"log"
 	"net/http"
+
+	_ "github.com/mattn/go-sqlite3"
 )
 
-type App struct {}
+type App struct {
+	articles *sqlite.ArticleModel
+}
 
 func main() {
-	app := &App{}
+	db, err := sql.Open("sqlite3", "./persistence/app.db")
+	if err != nil { log.Fatal(err) }
+
+	app := &App{
+		articles: &sqlite.ArticleModel{ DB: db },
+	}
 
 	server := http.Server{
 		Addr: ":8080",
 		Handler: app.routes(),
 	}
 
-	server.ListenAndServe()
 	log.Println("server started, listening on :8080")
+	log.Fatal(server.ListenAndServe())
 }
 
diff --git a/go.mod b/go.mod
index 2b2ba49..a939b96 100644
--- a/go.mod
+++ b/go.mod
@@ -13,6 +13,7 @@ require (
 	github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
 	github.com/golang/protobuf v1.4.2 // indirect
 	github.com/kennygrant/sanitize v1.2.4 // indirect
+	github.com/mattn/go-sqlite3 v1.14.24
 	github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
 	github.com/temoto/robotstxt v1.1.1 // indirect
 	golang.org/x/net v0.0.0-20200602114024-627f9648deb9 // indirect
diff --git a/go.sum b/go.sum
index 50b5cb1..ba2cd68 100644
--- a/go.sum
+++ b/go.sum
@@ -46,6 +46,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
 github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
 github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
+github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM=
+github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
 github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
diff --git a/internal/data/DefaultRepository.go b/internal/data/DefaultRepository.go
deleted file mode 100644
index 9766753..0000000
--- a/internal/data/DefaultRepository.go
+++ /dev/null
@@ -1,134 +0,0 @@
-package data
-
-import (
-	"encoding/json"
-	"errors"
-	"slices"
-	"strings"
-)
-
-// Default implementation of IRepository using an
IDatastore. -type DefaultRepository[T IIdentifiable] struct { - ds IDatastore - prefix string -} - -// Creates a new DefaultRepository for a generic type T given a IDatastore and -// a prefix. The type T must implement the IIdentifiable interface. The prefix -// will be used for the key for every entry using the created repository. The -// prefix should be not yet be in use in the given datastore and not contain a -// ':' character. -func NewDefaultRepository[T IIdentifiable](ds IDatastore, prefix string) (*DefaultRepository[T], error) { - if strings.Contains(prefix, ":") { return nil, errors.New("prefix should not contain ':'") } - return &DefaultRepository[T]{ ds: ds, prefix: prefix }, nil -} - -// Creates a new entry in the repository with the given object t. Throws an -// error if there already exists an entry with the same id, the json encoding -// fails or the connection to the IDatastore fails. -func (repo *DefaultRepository[T]) Create(t T) error { - key := repo.prefix + ":" + t.Id() - exists, err := repo.ds.KeyExists(key) - if err != nil { return err } - if exists { return errors.New("entry with given id already exists") } - - d, err := json.Marshal(t) - if err != nil { return err } - - err = repo.ds.Set(key, string(d)) - if err != nil { return err } - - return nil -} - -// Updates the entry with the same id as t in the repository with the values of -// t. Trows an error if the json encoding fails or the connection to the -// IDatastore fails. -func (repo *DefaultRepository[T]) Update(t T) error { - key := repo.prefix + ":" + t.Id() - exists, err := repo.ds.KeyExists(key) - if err != nil { return err } - if !exists { return errors.New("no entry with given id") } - - d, err := json.Marshal(t) - if err != nil { return err } - - err = repo.ds.Set(key, string(d)) - if err != nil { return err } - - return nil -} - -// Delete the entry with the same id as t in the repository. 
Trows an error if -// the connection to the IDatastore fails or the key of t does not exist. -func (repo *DefaultRepository[T]) Delete(t T) error { - key := repo.prefix + ":" + t.Id() - - exists, err := repo.ds.KeyExists(key) - if err != nil { return err } - if !exists { return errors.New("no entry with given id") } - - err = repo.ds.Delete(key) - if err != nil { return err } - - return nil -} - -// Get all the objects of type T from the repository as a list. Trows an error -// if the connection to the IDatastore fails. -func (repo *DefaultRepository[T]) GetAll() ([]T, error) { - out := make([]T, 0) - - allkeys, err := repo.ds.GetAllKeys() - if err != nil { return nil, err } - - for key, _ := range allkeys { - splitkey := strings.Split(key, ":") - if splitkey[0] == repo.prefix { - - // retrieve the object - obj, err := repo.GetById(splitkey[1]) - if err != nil { return nil, err } - out = append(out, obj) - - } - } - - return out, nil -} - -// Get the objects of type T from the repository that has the given id. Trows an error -// if the connection to the IDatastore or the decoding process fails. -func (repo *DefaultRepository[T]) GetById(id string) (T, error) { - var obj T - - key := repo.prefix + ":" + id - value, err := repo.ds.Get(key) - if err != nil { return obj, err } - - err = json.Unmarshal([]byte(value), &obj) - if err != nil { return obj, err } - - return obj, nil -} - -// Returns a slice of all elememts in the repo that have a -// ISearchCriteria.Weight greater than 0 sort by that weight. Throws an error -// when the elememts cannot be retrieved from the repo. 
-func (repo *DefaultRepository[T]) GetByCriteria(c ISortCriteria[T]) ([]T, error) { - all, err := repo.GetAll() - if err != nil { return nil, err } - - filtered := make([]T, 0) - for _, elem := range all { - if c.Weight(elem) > 0 { filtered = append(filtered, elem) } - } - - slices.SortFunc(filtered, func(a, b T) int { - wa, wb := c.Weight(a), c.Weight(b) - if wa > wb { return -1 } - return 1 - }) - - return filtered, nil -} diff --git a/internal/data/FileDatastore.go b/internal/data/FileDatastore.go deleted file mode 100644 index 612554b..0000000 --- a/internal/data/FileDatastore.go +++ /dev/null @@ -1,139 +0,0 @@ -package data - -import ( - "os" - "maps" - "errors" - "encoding/json" -) - - -// A very simple datastructure, implementing the IDatastore interface. It uses -// a simple text file to the data as json. -type FileDatastore struct { - path string - data map[string]string -} - -// Creates a new FileDatastore object, creating the storage file in the -// process. -func NewFileDatastore(path string) (*FileDatastore, error) { - fds := &FileDatastore{ path: path } - - if _, err := fds.readMapObj(); err != nil { - if err := fds.writeMapObj(make(map[string]string)); err != nil { return nil, err } - } - - return fds, nil -} - -// Read the contents of the storage file and convert to a map object. May throw -// an error, if the file does not exit or the file content can not be -// converted. -func (fds *FileDatastore) readMapObj() (map[string]string, error) { - if fds.data != nil { - return fds.data, nil - } - - dat, err := os.ReadFile(fds.path) - if err != nil { return nil, err } - - var mapobj map[string]string - err = json.Unmarshal(dat, &mapobj) - if err != nil { return nil, err } - - return mapobj, nil -} - -// Write the map object to the storage file. Will overwrite the content of the -// file. May throw an error, if the file cannot be created or written to. 
-func (fds *FileDatastore) writeMapObj(m map[string]string) error { - file, err := os.Create(fds.path) - if err != nil { return err } - defer file.Close() - - encoder := json.NewEncoder(file) - if err := encoder.Encode(m); err != nil { return err } - - fds.data = m - - return nil -} - - -// --- implement IDatastore interface --- - -// Sets the key value pair given, overwriting if the key already exists. May -// through an error if the file cannot be opened or the contents cannot be -// decoded correctly. -func (fds *FileDatastore) Set(key string, val string) error { - m, err := fds.readMapObj() - if err != nil { return err } - - m[key] = val - - err = fds.writeMapObj(m) - if err != nil { return err } - - return nil -} - -// Check if for the given key a entry does exit. May through an error if the -// file cannot be opened or the contents cannot be decoded correctly. -func (fds *FileDatastore) KeyExists(key string) (bool, error) { - m, err := fds.readMapObj() - if err != nil { return false, err } - - _, ok := m[key] - - return ok, nil -} - -// Gets the value for the given key. May through an error if the key does not -// exit, the file cannot be opened or the contents cannot be decoded -// correctly. -func (fds *FileDatastore) Get(key string) (string, error) { - m, err := fds.readMapObj() - if err != nil { return "", err } - - val, ok := m[key] - if !ok { return "", errors.New("key not found") } - - return val, nil -} - -// Gets all the key value pairs from the file and returns them as a map object. -// May through an error if the file cannot be opened or the contents cannot be -// decoded correctly. -func (fds *FileDatastore) GetAll() (map[string]string, error) { - return fds.readMapObj() -} - -// Gets all the key the file and returns them as a map object. May through an -// error if the file cannot be opened or the contents cannot be decoded -// correctly. 
-func (fds *FileDatastore) GetAllKeys() (map[string]bool, error) { - m, err := fds.readMapObj() - if err != nil { return nil, err } - - out := make(map[string]bool) - for key := range maps.Keys(m) { - out[key] = true - } - - return out, nil -} - -// Deletes the entry with the given key. May through an error if the file -// cannot be opened or the contents cannot be decoded or encoded correctly. -func (fds *FileDatastore) Delete(key string) error { - m, err := fds.readMapObj() - if err != nil { return err } - - delete(m, key) - - err = fds.writeMapObj(m) - if err != nil { return err } - - return nil -} diff --git a/internal/data/IDatastore.go b/internal/data/IDatastore.go deleted file mode 100644 index 70eaf19..0000000 --- a/internal/data/IDatastore.go +++ /dev/null @@ -1,12 +0,0 @@ -package data - -// Defines the first layer of abstraction on the interface to a persistent data -// store. This may be a file or database. -type IDatastore interface { - Set(key string, val string) error - KeyExists(key string) (bool, error) - Get(key string) (string, error) - GetAll() (map[string]string, error) - GetAllKeys() (map[string]bool, error) - Delete(key string) error -} diff --git a/internal/data/IIdentifiable.go b/internal/data/IIdentifiable.go deleted file mode 100644 index 4e76159..0000000 --- a/internal/data/IIdentifiable.go +++ /dev/null @@ -1,7 +0,0 @@ -package data - -// Defines an Id function that uniquely identifies an object. This may be used -// as a primary key in a database/ datastore. -type IIdentifiable interface { - Id() string // not allowed to contain a ':' -} diff --git a/internal/data/IRepository.go b/internal/data/IRepository.go deleted file mode 100644 index aa08b5d..0000000 --- a/internal/data/IRepository.go +++ /dev/null @@ -1,13 +0,0 @@ -package data - -// An interface to manage generic structure objects persistently. Should use an -// IDatastore as the interface that actually stores and retrieves the data from -// an external source. 
-type IRepository[T IIdentifiable] interface { - Create(t T) error - Update(t T) error - Delete(t T) error - GetAll() ([]T, error) - GetById(id string) (T, error) - GetByCriteria(c ISortCriteria[T]) ([]T, error) -} diff --git a/internal/data/ISearchCriteria.go b/internal/data/ISearchCriteria.go deleted file mode 100644 index bd0c246..0000000 --- a/internal/data/ISearchCriteria.go +++ /dev/null @@ -1,7 +0,0 @@ -package data - -// Defines a Weight function that determines a order on type T. As an example -// this may be used to sort article by date or filter for a search term. -type ISortCriteria[T any] interface { - Weight(t T) int -} diff --git a/internal/model/article.go b/internal/model/article.go index fcb8dde..bbb6aae 100644 --- a/internal/model/article.go +++ b/internal/model/article.go @@ -2,13 +2,12 @@ package model import ( "time" - "crypto/sha256" - "encoding/hex" ) // TODO docstring type Article struct { + Identifier int SourceUrl string PublishDate time.Time FetchDate time.Time @@ -42,13 +41,3 @@ func (a *Article) ViewModel() *ArticleViewModel { Summary: summary, } } - - -// --- implement IIdentifiable interface --- - -// Generates a hash based on the source url of the article. Can be used to -// identify the article. 
-func (article *Article) Id() string {
-	hash := sha256.Sum256([]byte(article.SourceUrl))
-	return hex.EncodeToString(hash[:])
-}
diff --git a/internal/model/sqlite/articles.go b/internal/model/sqlite/articles.go
new file mode 100644
index 0000000..779b93b
--- /dev/null
+++ b/internal/model/sqlite/articles.go
@@ -0,0 +1,99 @@
+package sqlite
+
+import (
+	"crowsnest/internal/model"
+	"database/sql"
+)
+
+// ArticleModel reads and writes articles through the wrapped database handle.
+// It expects the articles table and the fts_articles full-text table created
+// by the migrations to exist.
+type ArticleModel struct {
+	DB *sql.DB
+}
+
+// All returns every row of the articles table, ordered by publishDate with the
+// newest article first. Returns an error if the query or reading a row fails.
+func (m *ArticleModel) All() ([]model.Article, error) {
+	stmt := `
+	SELECT id, title, sourceUrl, author, content, publishDate, fetchDate
+	FROM articles
+	ORDER BY publishDate DESC
+	`
+	rows, err := m.DB.Query(stmt)
+	if err != nil { return nil, err }
+
+	return scanArticles(rows)
+}
+
+// Search returns at most 10 articles whose fts_articles entry matches the
+// given query, ordered by ascending FTS5 rank and then newest publishDate
+// first. The query is handed to the FTS5 MATCH operator unchanged, so it must
+// be valid FTS5 query syntax; otherwise the database error is returned.
+func (m *ArticleModel) Search(query string) ([]model.Article, error) {
+	stmt := `
+	SELECT id, title, sourceUrl, author, content, publishDate, fetchDate
+	FROM articles JOIN (
+		SELECT id as id2, rank FROM fts_articles WHERE content MATCH ?
+	) ON id = id2
+	ORDER BY rank ASC, publishDate DESC
+	LIMIT 10
+	`
+	rows, err := m.DB.Query(stmt, query)
+	if err != nil { return nil, err }
+
+	return scanArticles(rows)
+}
+
+// scanArticles reads all remaining rows into articles. It expects the column
+// order id, title, sourceUrl, author, content, publishDate, fetchDate and
+// always closes rows, so the underlying connection is released even when a
+// row fails to scan.
+func scanArticles(rows *sql.Rows) ([]model.Article, error) {
+	defer rows.Close()
+
+	articles := []model.Article{}
+	for rows.Next() {
+		a := model.Article{}
+		err := rows.Scan(&a.Identifier, &a.Title, &a.SourceUrl, &a.Author, &a.Content, &a.PublishDate, &a.FetchDate)
+		if err != nil { return nil, err }
+
+		articles = append(articles, a)
+	}
+
+	if err := rows.Err(); err != nil { return nil, err }
+
+	return articles, nil
+}
+
+// Insert stores a new article in the articles table and adds its title, author
+// and content to the fts_articles full-text index. Both writes run in one
+// transaction, so a failure leaves neither behind. The Identifier of the given
+// article is ignored. Returns an error if any database statement fails.
+func (m *ArticleModel) Insert(a *model.Article) error {
+	// begin transaction
+	tx, err := m.DB.Begin()
+	if err != nil { return err }
+	// Rollback only returns sql.ErrTxDone after a successful Commit, so deferring
+	// it covers every early return below without undoing committed work.
+	defer tx.Rollback()
+
+	// insert article
+	stmt := `INSERT INTO articles (title, sourceUrl, author, content, publishDate, fetchDate)
+	VALUES (?, ?, ?, ?, ?, ?)
+	`
+	result, err := tx.Exec(stmt, a.Title, a.SourceUrl, a.Author, a.Content, a.PublishDate, a.FetchDate)
+	if err != nil { return err }
+	lastId, err := result.LastInsertId()
+	if err != nil { return err }
+
+	// insert into fts_articles for full-text search; char(10) is a newline,
+	// SQLite string literals do not interpret '\n' as an escape sequence
+	stmt = `INSERT INTO fts_articles (id, content)
+	VALUES (?, ? || char(10) || ? || char(10) || ?)
+	`
+	_, err = tx.Exec(stmt, lastId, a.Title, a.Author, a.Content)
+	if err != nil { return err }
+
+	return tx.Commit()
+}