remove duplicate files #8

Merged
eliaskohout merged 12 commits from rm_duplicate_file into main 2025-01-20 21:29:36 +01:00
12 changed files with 0 additions and 860 deletions

View File

@@ -1,42 +0,0 @@
package app
import (
"html/template"
"net/http"
"strconv"
)
// Endpoint that returns a single article given the id in the URL path.
// Uses the articlePage template.
func (app *App) Article(w http.ResponseWriter, req *http.Request) {
// get id
id, err := strconv.ParseUint(req.PathValue("id"), 10, 64)
if err != nil {
http.NotFound(w, req)
return
}
// get article
article, err := app.articles.ById(int(id))
if err != nil {
// treat as no result
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// render template
t := template.Must(template.ParseFiles(
"assets/templates/articlePage.html",
"assets/templates/layout.html",
))
data := map[string]interface{}{
"SelectedNavItemArticle": false,
"ArticlePageVM": article.PageViewModel(),
}
err = t.ExecuteTemplate(w, "base", data)
if err != nil {
http.Error(w, "Failed to render template", http.StatusInternalServerError)
return
}
}
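
Side note on the handler above: req.PathValue only returns a value when the handler is registered on a Go 1.22+ ServeMux pattern declaring an {id} wildcard. A minimal, self-contained sketch of that wiring (the /article/{id} route is an assumption for illustration, not taken from this repository):

package main

import (
	"fmt"
	"net/http"
)

func main() {
	mux := http.NewServeMux()
	// the {id} wildcard is what makes req.PathValue("id") non-empty
	mux.HandleFunc("GET /article/{id}", func(w http.ResponseWriter, req *http.Request) {
		fmt.Fprintf(w, "article id: %s\n", req.PathValue("id"))
	})
	http.ListenAndServe(":8080", mux)
}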

View File

@@ -1,53 +0,0 @@
package app
import (
"crowsnest/internal/model"
"html/template"
"net/http"
"strconv"
)
// List the latest articles using the base template.
func (app *App) Index(w http.ResponseWriter, req *http.Request) {
const pageSize = 15
var limit, offset, pageId uint64 = pageSize, 0, 0
var err error
// get page number (guard pageId > 0: requesting page 0 would underflow the uint)
if pageId, err = strconv.ParseUint(req.PathValue("id"), 10, 32); err == nil && pageId > 0 {
pageId--
offset = pageId * pageSize
}
// get articles
articleVMs, err := app.articles.AllArticleViewModels(int(limit), int(offset))
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// get count of total articles
totalCount, err := app.articles.CountAll()
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// convert the article count into a page count (integer division rounds down)
totalCount /= pageSize
// render template
t := template.Must(template.ParseFiles(
"assets/templates/article.html",
"assets/templates/layout.html",
"assets/templates/components/pagination.html"))
data := map[string]interface{}{
"SelectedNavItemArticle": true,
"ArticleVMs": &articleVMs,
"Paginations": model.NewPaginationViewModel(uint(pageId+1), totalCount+1),
}
err = t.ExecuteTemplate(w, "base", data)
if err != nil {
http.Error(w, "Failed to render template", http.StatusInternalServerError)
return
}
}
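
As noted in the comment above, totalCount /= pageSize rounds down, and the +1 passed to NewPaginationViewModel then overshoots by one page whenever the article count is an exact multiple of pageSize. A standalone sketch of the usual ceiling-division alternative (pageCount is a hypothetical helper, not part of this codebase):

package main

import "fmt"

// pageCount returns how many pages are needed for n items at
// pageSize items per page, using ceiling division.
func pageCount(n, pageSize uint) uint {
	if pageSize == 0 {
		return 0
	}
	return (n + pageSize - 1) / pageSize
}

func main() {
	fmt.Println(pageCount(0, 15))  // 0
	fmt.Println(pageCount(15, 15)) // 1
	fmt.Println(pageCount(16, 15)) // 2
}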

View File

@@ -1,42 +0,0 @@
package app
import (
"html/template"
"net/http"
)
// Endpoint that returns a list of articles given search terms in the post
// request of a search form. Uses the content template.
func (app *App) UpSearch(w http.ResponseWriter, req *http.Request) {
// construct search query
searchTerms := req.FormValue("search")
if searchTerms == "" {
app.Index(w, req)
return
}
// get articles
articleVMs, err := app.articles.SearchArticleViewModel(searchTerms)
if err != nil {
// treat as no result
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// render template
t := template.Must(template.ParseFiles(
"assets/templates/article.html",
"assets/templates/layout.html",
"assets/templates/components/pagination.html"))
data := map[string]interface{}{
"SelectedNavItemArticle": true,
"ArticleVMs": &articleVMs,
"Paginations": nil,
}
err = t.ExecuteTemplate(w, "base", data)
if err != nil {
http.Error(w, "Failed to render template", http.StatusInternalServerError)
return
}
}
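
The handler above reads its input from a form field named search. A self-contained sketch of how net/http decodes such a form post (the /search target is an assumption for illustration):

package main

import (
	"fmt"
	"net/http/httptest"
	"strings"
)

func main() {
	// simulate the POST that a search form would send
	req := httptest.NewRequest("POST", "/search", strings.NewReader("search=klima+energie"))
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	// FormValue parses the body on first use and returns the named field
	fmt.Println(req.FormValue("search")) // "klima energie"
}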

View File

@@ -1,69 +0,0 @@
package crawler
import (
"crowsnest/internal/model"
"crowsnest/internal/util"
"github.com/gocolly/colly/v2"
)
type CrawlerFacade struct {
spiegelFeedDistributer *util.Distributer[*model.Article]
zeitFeedDistributer *util.Distributer[*model.Article]
}
func (cf *CrawlerFacade) Init() {
// init
cf.spiegelFeedDistributer = &util.Distributer[*model.Article]{}
cf.spiegelFeedDistributer.Init()
cf.zeitFeedDistributer = &util.Distributer[*model.Article]{}
cf.zeitFeedDistributer.Init()
// run spiegel feed
sf := &WebFeed{}
sf.Init(
"https://www.spiegel.de/",
colly.AllowedDomains("www.spiegel.de", "spiegel.de"),
colly.CacheDir("./persistence/spiegel_cache"),
colly.MaxDepth(1),
)
sf_feed := sf.Feed()
sf_converter := ConverterSpiegel{}
sf_converter.Init()
go func() {
for val := range sf_feed {
article, err := sf_converter.Convert(val)
if err != nil {
continue
}
cf.spiegelFeedDistributer.Publish(article)
}
}()
// run zeit feed
zf := &WebFeed{}
zf.Init(
"https://www.zeit.de/index",
colly.AllowedDomains("www.zeit.de", "zeit.de"),
colly.CacheDir("./persistence/zeit_cache"),
colly.MaxDepth(1),
)
zf_feed := zf.Feed()
zf_converter := ZeitConverter{}
zf_converter.Init()
go func() {
for val := range zf_feed {
article, err := zf_converter.Convert(val)
if err != nil {
continue
}
cf.zeitFeedDistributer.Publish(article)
}
}()
}
func (cf *CrawlerFacade) SubscribeToSpiegelFeed(hook func(*model.Article)) {
cf.spiegelFeedDistributer.Subscribe(hook)
}
func (cf *CrawlerFacade) SubscribeToZeitFeed(hook func(*model.Article)) {
cf.zeitFeedDistributer.Subscribe(hook)
}
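
Usage of the facade, for reference: Init starts both crawls, and subscribers each receive their own clone of every article. A hedged sketch, assuming the package lives at crowsnest/internal/crawler:

package main

import (
	"fmt"

	"crowsnest/internal/crawler"
	"crowsnest/internal/model"
)

func main() {
	facade := &crawler.CrawlerFacade{}
	facade.Init()
	// each subscriber gets its own copy of every crawled article
	facade.SubscribeToSpiegelFeed(func(a *model.Article) {
		fmt.Println("spiegel:", a.Title)
	})
	facade.SubscribeToZeitFeed(func(a *model.Article) {
		fmt.Println("zeit:", a.Title)
	})
	select {} // block forever while the feeds run in the background
}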

View File

@@ -1,6 +0,0 @@
package crawler
type Resource struct {
Url string
Body string
}

View File

@@ -1,96 +0,0 @@
package crawler
import (
"crowsnest/internal/model"
"errors"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
type ConverterSpiegel struct {
pattern_paywall *regexp.Regexp
pattern_url *regexp.Regexp
pattern_whitespace *regexp.Regexp
}
func (c *ConverterSpiegel) Init() {
c.pattern_paywall = regexp.MustCompile(`"paywall":{"attributes":{"is_active":true`)
c.pattern_url = regexp.MustCompile(`^https://(www\.)?spiegel\.de.*`)
c.pattern_whitespace = regexp.MustCompile(`\s+`)
}
func (c *ConverterSpiegel) Convert(res *Resource) (*model.Article, error) {
// check url pattern
if !c.pattern_url.MatchString(res.Url) {
return nil, errors.New("invalid url pattern")
}
// check for paywall
if c.pattern_paywall.MatchString(res.Body) {
return nil, errors.New("unable to extract article due to paywall")
}
// construct goquery doc
doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Body))
if err != nil {
return nil, err
}
// check for article type
tag := doc.Find("meta[property='og:type']")
pagetype, exists := tag.Attr("content")
if !exists || pagetype != "article" {
return nil, errors.New("unable to extract article, not of type article")
}
// get title
tag = doc.Find("meta[property='og:title']")
title, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no title tag")
}
// use the description as the start of the article content
tag = doc.Find("meta[name='description']")
content, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no description tag")
}
content += " "
// get publishing date
tag = doc.Find("meta[name='date']")
datestr, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no date tag")
}
date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
if err != nil {
return nil, err
}
// get content
tag = doc.Find("main[id='Inhalt'] div > p")
tag.Each(func(index int, p *goquery.Selection) {
content += " " + p.Text()
})
// clean up content string
content = c.pattern_whitespace.ReplaceAllString(content, " ")
content = strings.ReplaceAll(content, "»", "\"")
content = strings.ReplaceAll(content, "«", "\"")
// create new article
return &model.Article{
SourceUrl: res.Url,
PublishDate: date,
FetchDate: time.Now(),
Title: title,
Content: content,
}, nil
}
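
The converter rejects non-Spiegel URLs before any HTML parsing happens. A hedged sketch of that guard path, assuming the crowsnest/internal/crawler import path:

package main

import (
	"fmt"

	"crowsnest/internal/crawler"
)

func main() {
	c := crawler.ConverterSpiegel{}
	c.Init()
	// a URL outside spiegel.de fails fast with "invalid url pattern"
	_, err := c.Convert(&crawler.Resource{Url: "https://example.com/x", Body: ""})
	fmt.Println(err)
}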

View File

@@ -1,67 +0,0 @@
package crawler
import (
"crowsnest/internal/util"
"log"
"strings"
"time"
"github.com/gocolly/colly/v2"
)
type WebFeed struct {
feed chan *Resource
collector *colly.Collector
}
// Init the WebFeed, starting the process of collecting Resources.
func (sf *WebFeed) Init(indexUrl string, options ...colly.CollectorOption) {
// create feed
sf.feed = make(chan *Resource, 100)
// set cache, domain pattern and max recursion depth
sf.collector = colly.NewCollector(options...)
// emit each fetched page into the feed channel as a Resource
sf.collector.OnResponse(func(r *colly.Response) {
url := r.Request.URL.String()
body := string(r.Body)
sf.feed <- &Resource{Url: url, Body: body}
})
// cascade
sf.collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
url := e.Attr("href")
if !strings.HasPrefix(url, "http") {
return
}
e.Request.Visit(url)
})
// start runner
go sf.runner(indexUrl)
}
// Get the channel into which the collected Resources will be written.
func (sf *WebFeed) Feed() <-chan *Resource {
return sf.feed
}
func (sf *WebFeed) runner(indexUrl string) {
for {
// sleep for 5 minutes between crawl rounds
time.Sleep(5 * time.Minute)
// collect index
urls, err := util.GetAllURLs(indexUrl)
if err != nil {
log.Println("error in WebFeed runner: ", err.Error())
continue
}
// visit urls
for _, url := range urls {
sf.collector.Visit(url)
}
}
}
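
Consumers of the WebFeed simply range over the channel returned by Feed(). A hedged sketch, again assuming the crowsnest/internal/crawler import path:

package main

import (
	"fmt"

	"crowsnest/internal/crawler"

	"github.com/gocolly/colly/v2"
)

func main() {
	wf := &crawler.WebFeed{}
	wf.Init("https://www.spiegel.de/", colly.MaxDepth(1))
	// blocks until the background runner starts delivering pages
	for res := range wf.Feed() {
		fmt.Println(res.Url, len(res.Body))
	}
}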

View File

@@ -1,100 +0,0 @@
package crawler
import (
"crowsnest/internal/model"
"errors"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
type ZeitConverter struct {
pattern_url *regexp.Regexp
pattern_whitespace *regexp.Regexp
}
func (c *ZeitConverter) Init() {
c.pattern_url = regexp.MustCompile(`^https://(www\.)?zeit\.de[^#]*$`)
c.pattern_whitespace = regexp.MustCompile(`\s+`)
}
func (c *ZeitConverter) Convert(res *Resource) (*model.Article, error) {
// check url pattern
if !c.pattern_url.MatchString(res.Url) {
return nil, errors.New("invalid url pattern")
}
// construct goquery doc
doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Body))
if err != nil {
return nil, err
}
// check for article type
tag := doc.Find("meta[property='og:type']")
pagetype, exists := tag.Attr("content")
if !exists || pagetype != "article" {
return nil, errors.New("unable to extract article, not of type article")
}
// check for paywall
tag = doc.Find("meta[property='article:content_tier']")
pagetype, exists = tag.Attr("content")
if !exists || pagetype != "free" {
return nil, errors.New("unable to extract article due to paywal")
}
// get title
tag = doc.Find("meta[property='og:title']")
title, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no title tag")
}
// use the description as the start of the article content
tag = doc.Find("meta[name='description']")
content, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no description tag")
}
content += " "
if strings.Contains(content, "Das Liveblog") {
return nil, errors.New("unable to extract article, no support for liveblog")
}
// get publishing date
tag = doc.Find("meta[name='date']")
datestr, exists := tag.Attr("content")
if !exists {
return nil, errors.New("unable to extract article, no date tag")
}
date, err := time.Parse("2006-01-02T15:04:05-07:00", datestr)
if err != nil {
return nil, err
}
// get content
tag = doc.Find("main > article > div.article-body p.article__item")
tag.Each(func(index int, p *goquery.Selection) {
content += " " + p.Text()
})
// clean up content string
content = c.pattern_whitespace.ReplaceAllString(content, " ")
content = strings.ReplaceAll(content, "»", "\"")
content = strings.ReplaceAll(content, "«", "\"")
// create new article
return &model.Article{
SourceUrl: res.Url,
PublishDate: date,
FetchDate: time.Now(),
Title: title,
Content: content,
}, nil
}
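
One detail that differs from the Spiegel converter: the Zeit URL regex ends in [^#]*$, so any URL carrying a fragment is rejected. A standalone check:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	p := regexp.MustCompile(`^https://(www\.)?zeit\.de[^#]*$`)
	fmt.Println(p.MatchString("https://www.zeit.de/politik/index"))          // true
	fmt.Println(p.MatchString("https://www.zeit.de/politik/index#comments")) // false
}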

View File

@@ -1,234 +0,0 @@
package database
import (
"crowsnest/internal/model"
"database/sql"
"net/url"
"strings"
)
type ArticleRepository struct {
DB *sql.DB
}
// Gets all article objects from the database. This may return an error if
// the connection to the database fails.
func (m *ArticleRepository) All(limit int, offset int) ([]model.Article, error) {
stmt := `
SELECT id, title, sourceUrl, content, publishDate, fetchDate
FROM articles
ORDER BY publishDate DESC
LIMIT $1 OFFSET $2
`
rows, err := m.DB.Query(stmt, limit, offset)
if err != nil {
return nil, err
}
defer rows.Close()
articles := []model.Article{}
for rows.Next() {
a := model.Article{}
err := rows.Scan(&a.Id, &a.Title, &a.SourceUrl, &a.Content, &a.PublishDate, &a.FetchDate)
if err != nil {
return nil, err
}
articles = append(articles, a)
}
if err = rows.Err(); err != nil {
return nil, err
}
return articles, nil
}
func (m *ArticleRepository) AllArticleViewModels(limit int, offset int) ([]*model.ArticleViewModel, error) {
stmt := `
SELECT a.id, a.title, a.sourceUrl, a.publishDate, d.summary
FROM articles a JOIN documents d ON a.document_id = d.id
ORDER BY a.publishDate DESC
LIMIT $1 OFFSET $2
`
rows, err := m.DB.Query(stmt, limit, offset)
if err != nil {
return nil, err
}
defer rows.Close()
articleVMs := []*model.ArticleViewModel{}
var sourceUrl string
for rows.Next() {
a := model.ArticleViewModel{}
err := rows.Scan(&a.Id, &a.Title, &sourceUrl, &a.PublishDate, &a.Summary)
if err != nil {
return nil, err
}
// summary
if a.Summary == "" {
a.Summary = "N/A"
}
// short url
parsedURL, err := url.Parse(sourceUrl)
if err == nil {
a.ShortSource = parsedURL.Hostname()
} else {
a.ShortSource = ""
}
// ai summary always false
a.AiSummarized = false
articleVMs = append(articleVMs, &a)
}
if err = rows.Err(); err != nil {
return nil, err
}
return articleVMs, nil
}
// Counts all articles in the database. This may return an error if the
// connection to the database fails.
func (m *ArticleRepository) CountAll() (uint, error) {
stmt := `SELECT count(id) FROM articles `
rows := m.DB.QueryRow(stmt)
count := uint(0)
if err := rows.Scan(&count); err != nil {
return 0, err
}
return count, nil
}
// Will use the full-text search features of the underlying database to search
// articles for a given search query. This may fail if the connection to the
// database fails.
func (m *ArticleRepository) SearchArticleViewModel(query string) ([]*model.ArticleViewModel, error) {
stmt := `
SELECT a.id, a.title, a.sourceUrl, a.publishDate, d.summary
FROM articles a JOIN documents d ON a.document_id = d.id
WHERE to_tsvector('german', d.content) @@ to_tsquery('german', $1)
ORDER BY ts_rank(to_tsvector('german', d.content), to_tsquery('german', $1)) DESC
LIMIT 10
`
query = strings.Join(strings.Split(strings.TrimSpace(query), " "), " | ")
rows, err := m.DB.Query(stmt, query)
if err != nil {
return nil, err
}
defer rows.Close()
articleVMs := []*model.ArticleViewModel{}
for rows.Next() {
a := &model.ArticleViewModel{}
var sourceUrl string
err := rows.Scan(&a.Id, &a.Title, &sourceUrl, &a.PublishDate, &a.Summary)
if err != nil {
return nil, err
}
// summary
if a.Summary == "" {
a.Summary = "N/A"
}
// short url
parsedURL, err := url.Parse(sourceUrl)
if err == nil {
a.ShortSource = parsedURL.Hostname()
} else {
a.ShortSource = ""
}
// ai summary always false
a.AiSummarized = false
articleVMs = append(articleVMs, a)
}
if err = rows.Err(); err != nil {
return nil, err
}
return articleVMs, nil
}
// Will use the full-text search features of the underlying database to search
// articles for a given search query. This may fail if the connection to the
// database fails.
func (m *ArticleRepository) Search(query string) ([]model.Article, error) {
stmt := `
SELECT a.id, a.title, a.sourceurl, a.content, a.publishdate, a.fetchDate
FROM articles a JOIN documents d ON a.document_id = d.id
WHERE to_tsvector('german', d.content) @@ to_tsquery('german', $1)
ORDER BY ts_rank(to_tsvector('german', d.content), to_tsquery('german', $1)) DESC
LIMIT 10
`
query = strings.Join(strings.Split(strings.TrimSpace(query), " "), " | ")
rows, err := m.DB.Query(stmt, query)
if err != nil {
return nil, err
}
defer rows.Close()
articles := []model.Article{}
for rows.Next() {
a := model.Article{}
err := rows.Scan(&a.Id, &a.Title, &a.SourceUrl, &a.Content, &a.PublishDate, &a.FetchDate)
if err != nil {
return nil, err
}
articles = append(articles, a)
}
if err = rows.Err(); err != nil {
return nil, err
}
return articles, nil
}
// Will return an article given an id. This may fail if the connection to the
// database fails or there is no article with the given id.
func (m *ArticleRepository) ById(id int) (*model.Article, error) {
stmt := `
SELECT a.id, a.title, a.sourceurl, a.content, a.publishdate, a.fetchDate
FROM articles a
WHERE a.id = $1
`
rows := m.DB.QueryRow(stmt, id)
a := &model.Article{}
if err := rows.Scan(&a.Id, &a.Title, &a.SourceUrl, &a.Content, &a.PublishDate, &a.FetchDate); err != nil {
return nil, err
}
return a, nil
}
// Inserts a new article into the database. The id attribute of the given
// article will be ignored. May return an error if the execution of the
// database query fails.
func (m *ArticleRepository) Insert(a *model.Article) error {
// insert article
stmt := `INSERT INTO articles (title, sourceUrl, content, publishDate, fetchDate)
VALUES ($1, $2, $3, $4, $5)
`
_, err := m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Content, a.PublishDate, a.FetchDate)
return err
}
func (m *ArticleRepository) Update(a *model.Article) error {
stmt := `UPDATE articles
SET title = $1, sourceUrl = $2, content = $3, publishDate = $4, fetchDate = $5
WHERE id = $6
`
_, err := m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Content, a.PublishDate, a.FetchDate, a.Id)
return err
}
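
The two search methods OR the query words together before handing them to to_tsquery. The string transformation in isolation:

package main

import (
	"fmt"
	"strings"
)

func main() {
	query := "klima energie wende"
	// trim, split on single spaces, and join with the tsquery OR operator
	query = strings.Join(strings.Split(strings.TrimSpace(query), " "), " | ")
	fmt.Println(query) // "klima | energie | wende"
}

Note that strings.Split leaves empty elements behind when the input contains runs of spaces; strings.Fields would be the more robust choice here.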

View File

@@ -1,109 +0,0 @@
package database
import (
"crowsnest/internal/model"
"database/sql"
)
type DocumentRepository struct {
DB *sql.DB
}
// Gets all document objects from the database. This may return an error if
// the connection to the database fails.
func (d *DocumentRepository) All(limit int, offset int) ([]*model.Document, error) {
stmt := `
SELECT id, content, summary
FROM documents
LIMIT $1 OFFSET $2
`
rows, err := d.DB.Query(stmt, limit, offset)
if err != nil {
return nil, err
}
defer rows.Close()
docs := []*model.Document{}
for rows.Next() {
d := model.Document{}
err := rows.Scan(&d.Id, &d.Content, &d.Summary)
if err != nil {
return nil, err
}
docs = append(docs, &d)
}
if err = rows.Err(); err != nil {
return nil, err
}
return docs, nil
}
// Will return a document given an id. This may fail if the connection to the
// database fails or there is no document with the given id.
func (m *DocumentRepository) ById(id int) (*model.Document, error) {
stmt := `
SELECT id, content, summary
FROM documents
WHERE id = $1
`
rows := m.DB.QueryRow(stmt, id)
d := &model.Document{}
if err := rows.Scan(&d.Id, &d.Content, &d.Summary); err != nil {
return nil, err
}
return d, nil
}
// Counts all documents in the database. This may return an error if the
// connection to the database fails.
func (d *DocumentRepository) CountAll() (uint, error) {
stmt := `SELECT count(id) FROM documents`
rows := d.DB.QueryRow(stmt)
count := uint(0)
if err := rows.Scan(&count); err != nil {
return 0, err
}
return count, nil
}
func (m *DocumentRepository) Update(d *model.Document) error {
stmt := `UPDATE documents
SET content = $1, summary = $2
WHERE id = $3
`
_, err := m.DB.Exec(stmt, d.Content, d.Summary, d.Id)
return err
}
func (d *DocumentRepository) Map(transform func(*model.Document) *model.Document) (int, error) {
processed := 0
count, err := d.CountAll()
if err != nil {
return processed, err
}
for i := 0; i < int(count); i += 10 {
docs, err := d.All(10, i)
if err != nil {
return processed, err
}
for _, doc := range docs {
new_doc := transform(doc)
err = d.Update(new_doc)
if err != nil {
return processed, err
}
processed++
}
}
return processed, nil
}
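
Map pages through all documents in batches of 10 and persists each transformed document. A hedged usage sketch; the Postgres driver and connection string are assumptions (the $1-style placeholders in this repository suggest Postgres):

package main

import (
	"database/sql"
	"log"
	"strings"

	"crowsnest/internal/database"
	"crowsnest/internal/model"

	_ "github.com/lib/pq"
)

func main() {
	db, err := sql.Open("postgres", "postgres://localhost/crowsnest?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	repo := &database.DocumentRepository{DB: db}
	// normalize every stored summary in place
	n, err := repo.Map(func(d *model.Document) *model.Document {
		d.Summary = strings.TrimSpace(d.Summary)
		return d
	})
	log.Printf("processed %d documents (err: %v)", n, err)
}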

View File

@@ -1,37 +0,0 @@
package util
type Distributer[T IClone[T]] struct {
queue chan T
hooks []func(T)
}
func (d *Distributer[T]) Init() {
d.queue = make(chan T, 100)
d.hooks = make([]func(T), 0)
}
// Distribute a copy of an item to every hook that has subscribed to this
// Distributer.
func (d *Distributer[T]) Publish(item T) {
d.queue <- item
}
// Add a new hook to the Distributer. The hook will be called asynchronously
// whenever a new item is published.
func (d *Distributer[T]) Subscribe(hook func(T)) {
d.hooks = append(d.hooks, hook)
if len(d.hooks) == 1 {
go d.runner()
}
}
// Started as a goroutine when Subscribe is first called. Whenever Publish is
// called, the runner distributes a clone of the new item to every hook.
func (d *Distributer[T]) runner() {
for val := range d.queue {
for _, f := range d.hooks {
go f(val.Clone())
}
}
}

View File

@@ -1,5 +0,0 @@
package util
type IClone[T any] interface {
Clone() T
}
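
Distributer's type parameter is constrained to IClone, so every hook receives an independent copy of each published item. A self-contained sketch with a toy Clone implementation, assuming these types live at crowsnest/internal/util:

package main

import (
	"fmt"
	"time"

	"crowsnest/internal/util"
)

// Msg implements util.IClone[*Msg] so it can flow through a Distributer.
type Msg struct{ Text string }

func (m *Msg) Clone() *Msg {
	c := *m // a shallow copy suffices for a struct of plain values
	return &c
}

func main() {
	d := &util.Distributer[*Msg]{}
	d.Init()
	// each hook gets its own clone and may mutate it freely
	d.Subscribe(func(m *Msg) { fmt.Println("hook A:", m.Text) })
	d.Subscribe(func(m *Msg) { fmt.Println("hook B:", m.Text) })
	d.Publish(&Msg{Text: "hello"})
	time.Sleep(100 * time.Millisecond) // give the async hooks time to run
}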