diff --git a/assets/migrations/20250107104609_drop_column_author.sql b/assets/migrations/20250107104609_drop_column_author.sql
new file mode 100644
index 0000000..e66d51f
--- /dev/null
+++ b/assets/migrations/20250107104609_drop_column_author.sql
@@ -0,0 +1,32 @@
+-- +goose Up
+-- +goose StatementBegin
+-- goose already runs each migration inside its own transaction, so no explicit BEGIN/COMMIT here.
+
+DROP INDEX IF EXISTS articles_fts_idx;
+ALTER TABLE articles DROP COLUMN IF EXISTS fts_vector;
+
+ALTER TABLE articles DROP COLUMN IF EXISTS author;
+
+ALTER TABLE articles
+ADD COLUMN fts_vector tsvector GENERATED ALWAYS AS (
+    to_tsvector('german', coalesce(title, '') || ' ' || coalesce(content, ''))
+) STORED;
+CREATE INDEX articles_fts_idx ON articles USING gin(fts_vector);
+-- +goose StatementEnd
+
+-- +goose Down
+-- +goose StatementBegin
+-- Re-adds the column with an empty default; the author values dropped by Up are not restored.
+
+ALTER TABLE articles ADD COLUMN author VARCHAR(255) DEFAULT '';
+
+DROP INDEX IF EXISTS articles_fts_idx;
+ALTER TABLE articles DROP COLUMN IF EXISTS fts_vector;
+
+ALTER TABLE articles
+ADD COLUMN fts_vector tsvector GENERATED ALWAYS AS (
+    to_tsvector('german', coalesce(title, '') || ' ' || coalesce(content, '') || ' ' || coalesce(author, ''))
+) STORED;
+
+CREATE INDEX articles_fts_idx ON articles USING gin(fts_vector);
+-- +goose StatementEnd
\ No newline at end of file
diff --git a/cmd/crawler/collectors/spiegel.go b/cmd/crawler/collectors/spiegel.go
index 08dbd9b..40e8644 100644
--- a/cmd/crawler/collectors/spiegel.go
+++ b/cmd/crawler/collectors/spiegel.go
@@ -59,7 +59,7 @@ func (c *Collector) ExtractSpiegel(url string, body []byte) error {
 	whitespace := regexp.MustCompile(`\s+`)
 
 	var exists bool
-	var pagetype, title, content, datestr, author string
+	var pagetype, title, content, datestr string
 	var tag *goquery.Selection
 	var date time.Time
 
@@ -113,13 +113,6 @@ func (c *Collector) ExtractSpiegel(url string, body []byte) error {
 		return err
 	}
 
-	// get author
-	tag = doc.Find("meta[name='author']")
-	author, exists = tag.Attr("content")
-	if !exists {
-		return errors.New("unable 
to extract article, no author tag")
-	}
-
 	// get content
 	tag = doc.Find("main[id='Inhalt'] div > p")
 
@@ -139,7 +132,6 @@ func (c *Collector) ExtractSpiegel(url string, body []byte) error {
 		FetchDate: time.Now(),
 		Title: title,
 		Content: content,
-		Author: author,
 	}
 
 	err = c.Articles.Insert(&article)
diff --git a/internal/model/article.go b/internal/model/article.go
index b26fc8e..b1f631c 100644
--- a/internal/model/article.go
+++ b/internal/model/article.go
@@ -1,58 +1,59 @@
 package model
 
 import (
-    "time"
-    //"strings"
-    "net/url"
+	"net/url"
+	"time"
 )
 
-
-// TODO docstring
+// Article is a single news article as read from and written to the articles table.
 type Article struct {
-    Identifier int
-    SourceUrl string
-    PublishDate time.Time
-    FetchDate time.Time
-    Title string
-    Content string
-    Author string
+	Identifier  int
+	SourceUrl   string
+	PublishDate time.Time
+	FetchDate   time.Time
+	Title       string
+	Content     string
 }
 
-// TODO docstring
+// ArticleViewModel holds the display-ready fields derived from an Article.
 type ArticleViewModel struct {
-    Title string
-    Author string
-    PublishDate string
-    SourceUrl string
-    ShortSource string
-    Summary string
+	Title       string
+	PublishDate string
+	SourceUrl   string
+	ShortSource string
+	Summary     string
 }
 
-
-// TODO docstring
+// ViewModel converts the article into its display form: the summary is the
+// content cut to at most 300 bytes without splitting a UTF-8 character, the
+// short source is the host name of SourceUrl (empty if the URL does not parse),
+// and the publish date is formatted as DD.MM.YYYY in local time.
 func (a *Article) ViewModel() *ArticleViewModel {
-    summary := a.Content
-    if len(a.Content) > 300 {
-        summary = summary[:300]
-    }
+	summary := a.Content
+	if len(summary) > 300 {
+		// Byte-slicing at 300 could cut a multi-byte character (e.g. an umlaut)
+		// in half, so back off to the last rune start at or before byte 300.
+		cut := 0
+		for i := range summary {
+			if i > 300 {
+				break
+			}
+			cut = i
+		}
+		summary = summary[:cut]
+	}
 
-    short_url := ""
+	shortURL := ""
 	parsedURL, err := url.Parse(a.SourceUrl)
-    if err == nil {
-        short_url = parsedURL.Hostname()
+	if err == nil {
+		shortURL = parsedURL.Hostname()
+	}
 
-        //hostParts := strings.Split(short_url, ".")
-        //if len(hostParts) >= 2 {
-        //    short_url = strings.Join(hostParts[len(hostParts)-2:], ".")
-        //}
-    }
-
-    return &ArticleViewModel{
-        Title: a.Title,
-        Author: a.Author,
-        PublishDate: a.PublishDate.Local().Format("02.01.2006"),
-        SourceUrl: a.SourceUrl,
-        ShortSource: short_url,
-        Summary: summary,
-    }
+	return &ArticleViewModel{
+		Title:       a.Title,
+		PublishDate: a.PublishDate.Local().Format("02.01.2006"),
+		SourceUrl:   a.SourceUrl,
+		ShortSource: shortURL,
+		Summary:     summary,
+	}
 }
diff --git a/internal/model/database/articles.go b/internal/model/database/articles.go
index bb3c2e4..2ce6f84 100644
--- a/internal/model/database/articles.go
+++ b/internal/model/database/articles.go
@@ -13,7 +13,7 @@ type ArticleModel struct {
 // the connection to the database fails.
 func (m *ArticleModel) All(limit int) ([]model.Article, error) {
 	stmt := `
-		SELECT id, title, sourceUrl, author, content, publishDate, fetchDate
+		SELECT id, title, sourceUrl, content, publishDate, fetchDate
 		FROM articles
 		ORDER BY publishDate DESC
 		LIMIT $1
@@ -26,7 +26,7 @@ func (m *ArticleModel) All(limit int) ([]model.Article, error) {
 	articles := []model.Article{}
 	for rows.Next() {
 		a := model.Article{}
-		err := rows.Scan(&a.Identifier, &a.Title, &a.SourceUrl, &a.Author, &a.Content, &a.PublishDate, &a.FetchDate)
+		err := rows.Scan(&a.Identifier, &a.Title, &a.SourceUrl, &a.Content, &a.PublishDate, &a.FetchDate)
 		if err != nil {
 			return nil, err
 		}
@@ -46,7 +46,7 @@ func (m *ArticleModel) All(limit int) ([]model.Article, error) {
 // database fails.
 func (m *ArticleModel) Search(query string) ([]model.Article, error) {
 	stmt := `
-		SELECT id, title, sourceurl, author, content, publishdate, fetchDate
+		SELECT id, title, sourceurl, content, publishdate, fetchDate
 		FROM articles
 		WHERE fts_vector @@ to_tsquery('german', $1)
 		ORDER BY ts_rank(fts_vector, to_tsquery('german', $1)) DESC
@@ -61,7 +61,7 @@ func (m *ArticleModel) Search(query string) ([]model.Article, error) {
 	articles := []model.Article{}
 	for rows.Next() {
 		a := model.Article{}
-		err := rows.Scan(&a.Identifier, &a.Title, &a.SourceUrl, &a.Author, &a.Content, &a.PublishDate, &a.FetchDate)
+		err := rows.Scan(&a.Identifier, &a.Title, &a.SourceUrl, &a.Content, &a.PublishDate, &a.FetchDate)
 		if err != nil {
 			return nil, err
 		}
@@ -80,19 +80,19 @@ func (m *ArticleModel) Search(query string) ([]model.Article, error) {
 // query fails.
func (m *ArticleModel) Insert(a *model.Article) error {
 	// insert article
-	stmt := `INSERT INTO articles (title, sourceUrl, author, content, publishDate, fetchDate)
-		VALUES ($1, $2, $3, $4, $5, $6)
+	stmt := `INSERT INTO articles (title, sourceUrl, content, publishDate, fetchDate)
+		VALUES ($1, $2, $3, $4, $5)
 	`
-	_, err := m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Author, a.Content, a.PublishDate, a.FetchDate)
+	_, err := m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Content, a.PublishDate, a.FetchDate)
 	return err
 }
 
-// TODO docstring
+// Update overwrites the row whose id is a.Identifier; placeholders $1-$6 map one-to-one onto the six Exec arguments.
 func (m *ArticleModel) Update(a *model.Article) error {
 	stmt := `UPDATE articles
-		SET title = $1, sourceUrl = $2, author = $3, content = $4, publishDate = $5, fetchDate = $6
-		WHERE id = $7
+		SET title = $1, sourceUrl = $2, content = $3, publishDate = $4, fetchDate = $5
+		WHERE id = $6
 	`
-	_, err := m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Author, a.Content, a.PublishDate, a.FetchDate, a.Identifier)
+	_, err := m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Content, a.PublishDate, a.FetchDate, a.Identifier)
 	return err
 }