diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5643c1f --- /dev/null +++ b/Makefile @@ -0,0 +1,4 @@ +serv: + go run -tags='sqlite_fts5' cmd/frontend/* +crawl: + go run -tags='sqlite_fts5' cmd/crawler/main.go diff --git a/assets/migrations/20250103140557_response_cache.sql b/assets/migrations/20250103140557_response_cache.sql new file mode 100644 index 0000000..0c8d816 --- /dev/null +++ b/assets/migrations/20250103140557_response_cache.sql @@ -0,0 +1,13 @@ +-- +goose Up +-- +goose StatementBegin +CREATE TABLE responses ( + url VARCHAR(255) NOT NULL UNIQUE PRIMARY KEY, + content TEXT NOT NULL, + fetchDate DATETIME DEFAULT CURRENT_TIMESTAMP +); +-- +goose StatementEnd + +-- +goose Down +-- +goose StatementBegin +DROP TABLE responses; +-- +goose StatementEnd diff --git a/assets/migrations/20250103203320_response_new_column.sql b/assets/migrations/20250103203320_response_new_column.sql new file mode 100644 index 0000000..9c5b714 --- /dev/null +++ b/assets/migrations/20250103203320_response_new_column.sql @@ -0,0 +1,9 @@ +-- +goose Up +-- +goose StatementBegin +ALTER TABLE responses ADD COLUMN processed BOOLEAN DEFAULT false; +-- +goose StatementEnd + +-- +goose Down +-- +goose StatementBegin +ALTER TABLE responses DROP COLUMN processed; +-- +goose StatementEnd diff --git a/cmd/crawler/collectors/collector.go b/cmd/crawler/collectors/collector.go new file mode 100644 index 0000000..c822fc3 --- /dev/null +++ b/cmd/crawler/collectors/collector.go @@ -0,0 +1,7 @@ +package collectors + +import "crowsnest/internal/model/sqlite" + +type Collector struct { + Responses *sqlite.ResponseModel +} diff --git a/cmd/crawler/collectors/spiegel.go b/cmd/crawler/collectors/spiegel.go new file mode 100644 index 0000000..9eccdfb --- /dev/null +++ b/cmd/crawler/collectors/spiegel.go @@ -0,0 +1,110 @@ +package collectors + +import ( + //"crowsnest/internal/model" + //"regexp" + //"time" + //"strings" + + "fmt" + "time" + + "github.com/gocolly/colly/v2" +) + + +func (c *Collector) Spiegel() { + collycollector := colly.NewCollector( + colly.AllowedDomains("www.spiegel.de", "spiegel.de"), + colly.CacheDir("./persistence/spiegel_cache"), + colly.MaxDepth(3), + ) + + // cascade + collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) { + e.Request.Visit(e.Attr("href")) + }) + + // cache + collycollector.OnScraped(func(r *colly.Response) { + c.Responses.Insert(r.Request.URL.String(), string(r.Body)) + }) + + // go through archive + startDate := time.Date(2000, time.January, 1, 0, 0, 0, 0, time.UTC) + currentDate := time.Now() + + for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 1) { + urlDate := date.Format("02.01.2006") + url := fmt.Sprintf("https://www.spiegel.de/nachrichtenarchiv/artikel-%s.html", urlDate) + + collycollector.Visit(url) + } + + //// create entry if not behind paywall + //paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false") + //collycollector.OnResponse(func(r *colly.Response) { + // if paywall_false_pattern.Match(r.Body) { + // url := r.Request.URL.String() + // (*results)[url] = &model.Article{ + // SourceUrl: url, + // FetchDate: time.Now(), + // Content: "", + // } + // } + + //}) + + //// check for article type + //collycollector.OnHTML("meta[property='og:type']", func(e *colly.HTMLElement) { + // if e.Attr("content") != "article" { + // delete(*results, e.Request.URL.String()) + // } + //}) + + //// add title + //collycollector.OnHTML("meta[property='og:title']", func(e *colly.HTMLElement) { + // if val, ok := (*results)[e.Request.URL.String()]; ok { + // val.Title = e.Attr("content") + // } + //}) + + //// prepend description to content of article + //collycollector.OnHTML("meta[name='description']", func(e *colly.HTMLElement) { + // if val, ok := (*results)[e.Request.URL.String()]; ok { + // val.Content = e.Attr("content") + val.Content + // } + //}) + + //// add publishing date + //collycollector.OnHTML("meta[name='date']", func(e *colly.HTMLElement) { + // if val, ok := (*results)[e.Request.URL.String()]; ok { + // t, err := time.Parse("2006-01-02T15:04:05-07:00", e.Attr("content")) + // if err != nil { + // panic(err) + // } + // val.PublishDate = t + // } + //}) + + //// add author + //collycollector.OnHTML("meta[name='author']", func(e *colly.HTMLElement) { + // if val, ok := (*results)[e.Request.URL.String()]; ok { + // val.Author = e.Attr("content") + // } + //}) + + //// add content + //collycollector.OnHTML("main[id='Inhalt'] div > p", func(e *colly.HTMLElement) { + // if val, ok := (*results)[e.Request.URL.String()]; ok { + // cont := val.Content + + // pattern := regexp.MustCompile("\\s+") + // cont = string(pattern.ReplaceAll([]byte(cont), []byte(" "))) + // cont = strings.ReplaceAll(cont, "»", "\"") + // cont = strings.ReplaceAll(cont, "«", "\"") + // val.Content = cont + " " + e.Text + // } + //}) + +} diff --git a/cmd/crawler/collectors/zeit.go b/cmd/crawler/collectors/zeit.go new file mode 100644 index 0000000..7b7bffb --- /dev/null +++ b/cmd/crawler/collectors/zeit.go @@ -0,0 +1,39 @@ +package collectors + +import ( + "fmt" + "time" + + "github.com/gocolly/colly/v2" +) + +// Gets every page of the archive of zeit.de and stores the responses into the +// database. +func (c *Collector) Zeit() { + collycollector := colly.NewCollector( + colly.AllowedDomains("www.zeit.de", "zeit.de"), + colly.CacheDir("./persistence/zeit_cache"), + colly.MaxDepth(2), + ) + + // cascade + collycollector.OnHTML("a[href]", func(e *colly.HTMLElement) { + e.Request.Visit(e.Attr("href")) + }) + + // cache + collycollector.OnScraped(func(r *colly.Response) { + c.Responses.Insert(r.Request.URL.String(), string(r.Body)) + }) + + // go through archive + startDate := time.Date(1946, time.January, 1, 0, 0, 0, 0, time.UTC) + currentDate := time.Now() + + for date := startDate; date.Before(currentDate) || date.Equal(currentDate); date = date.AddDate(0, 0, 7) { + year, week := date.ISOWeek() + url := fmt.Sprintf("https://www.zeit.de/%04d/%02d/index", year, week) + + collycollector.Visit(url) + } +} diff --git a/cmd/crawler/extractors/extractor.go b/cmd/crawler/extractors/extractor.go new file mode 100644 index 0000000..b83736e --- /dev/null +++ b/cmd/crawler/extractors/extractor.go @@ -0,0 +1,8 @@ +package extractors + +import "crowsnest/internal/model/sqlite" + +type Extractor struct { + Responses *sqlite.ResponseModel + Articles *sqlite.ArticleModel +} diff --git a/cmd/crawler/extractors/spiegel.go b/cmd/crawler/extractors/spiegel.go new file mode 100644 index 0000000..5c2bc4f --- /dev/null +++ b/cmd/crawler/extractors/spiegel.go @@ -0,0 +1,113 @@ +package extractors + +import ( + "crowsnest/internal/model" + "log" + "regexp" + "strings" + "time" + + "github.com/PuerkitoBio/goquery" +) + + +func (extractor *Extractor) Spiegel() error { + // get urls to process + urls, err := extractor.Responses.UnprocessedUrls() + if err != nil { return err } + + paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false") + url_pattern := regexp.MustCompile("^https://(www\\.)?spiegel.de.*") + whitespace := regexp.MustCompile("\\s+") + + var exists bool + var pagetype, title, content, datestr, author string + var tag *goquery.Selection + var date time.Time + + for _, url := range urls { + // check url url pattern + if !url_pattern.Match([]byte(url)) { continue } + + // get response + res, err := extractor.Responses.GetByUrl(url) + if err != nil { + log.Println("failed to process url", url, "with", err) + continue + } + + // check for paywall + if !paywall_false_pattern.Match([]byte(res.Content)) { + extractor.Responses.Processed(url) + continue + } + + // construct goquery doc + doc, err := goquery.NewDocumentFromReader(strings.NewReader(res.Content)) + if err != nil { + log.Println("failed to process url", url, "with", err) + continue + } + + // check for article type + tag = doc.Find("meta[property='og:type']") + pagetype, exists = tag.Attr("content") + if !exists || pagetype != "article" { extractor.Responses.Processed(url); continue; } + + // get title + tag = doc.Find("meta[property='og:title']") + title, exists = tag.Attr("content") + if !exists { extractor.Responses.Processed(url); continue; } + + // prepend description to content of article + tag = doc.Find("meta[name='description']") + content, exists = tag.Attr("content") + content += " " + if !exists { extractor.Responses.Processed(url); continue; } + + // get publishing date + tag = doc.Find("meta[name='date']") + datestr, exists = tag.Attr("content") + if !exists { extractor.Responses.Processed(url); continue; } + + date, err = time.Parse("2006-01-02T15:04:05-07:00", datestr) + if err != nil { extractor.Responses.Processed(url); continue; } + + // get author + tag = doc.Find("meta[name='author']") + author, exists = tag.Attr("content") + if !exists { extractor.Responses.Processed(url); continue; } + + // get content + tag = doc.Find("main[id='Inhalt'] div > p") + + tag.Each(func(index int, p *goquery.Selection) { + content += " " + p.Text() + }) + + // clean up content string + content = string(whitespace.ReplaceAll([]byte(content), []byte(" "))) + content = strings.ReplaceAll(content, "»", "\"") + content = strings.ReplaceAll(content, "«", "\"") + + // insert new article + article := model.Article{ + SourceUrl: url, + PublishDate: date, + FetchDate: res.FetchDate, + Title: title, + Content: content, + Author: author, + } + + err = extractor.Articles.Insert(&article) + if err != nil { + log.Println("failed to insert", article) + } else { + extractor.Responses.Processed(url) + log.Println("found article at", url) + } + } + + return nil +} diff --git a/cmd/crawler/main.go b/cmd/crawler/main.go index 5c95458..377d269 100644 --- a/cmd/crawler/main.go +++ b/cmd/crawler/main.go @@ -1,114 +1,35 @@ package main import ( - "crowsnest/internal/model" + "crowsnest/cmd/crawler/collectors" + "crowsnest/cmd/crawler/extractors" "crowsnest/internal/model/sqlite" "database/sql" - "fmt" "log" - "regexp" - "strings" - "time" - "github.com/gocolly/colly/v2" _ "github.com/mattn/go-sqlite3" ) -func spiegelCollector(results *map[string]*model.Article) *colly.Collector { - c := colly.NewCollector( - colly.AllowedDomains("www.spiegel.de", "spiegel.de"), - colly.CacheDir("./persistence/spiegel_cache"), - colly.MaxDepth(5), - ) - - // create entry if not behind paywall - paywall_false_pattern := regexp.MustCompile("\"paywall\":{\"attributes\":{\"is_active\":false") - c.OnResponse(func(r *colly.Response) { - if paywall_false_pattern.Match(r.Body) { - url := r.Request.URL.String() - (*results)[url] = &model.Article{ - SourceUrl: url, - FetchDate: time.Now(), - Content: "", - } - } - - }) - - // check for article type - c.OnHTML("meta[property='og:type']", func(e *colly.HTMLElement) { - if e.Attr("content") != "article" { - } - }) - - // add title - c.OnHTML("meta[property='og:title']", func(e *colly.HTMLElement) { - if val, ok := (*results)[e.Request.URL.String()]; ok { - val.Title = e.Attr("content") - } - }) - - // prepend description to content of article - c.OnHTML("meta[name='description']", func(e *colly.HTMLElement) { - if val, ok := (*results)[e.Request.URL.String()]; ok { - val.Content = e.Attr("content") + val.Content - } - }) - - // add publishing date - c.OnHTML("meta[name='date']", func(e *colly.HTMLElement) { - if val, ok := (*results)[e.Request.URL.String()]; ok { - t, err := time.Parse("2006-01-02T15:04:05-07:00", e.Attr("content")) - if err != nil { - panic(err) - } - val.PublishDate = t - } - }) - - // add author - c.OnHTML("meta[name='author']", func(e *colly.HTMLElement) { - if val, ok := (*results)[e.Request.URL.String()]; ok { - val.Author = e.Attr("content") - } - }) - - // add content - c.OnHTML("main[id='Inhalt'] div > p", func(e *colly.HTMLElement) { - if val, ok := (*results)[e.Request.URL.String()]; ok { - cont := val.Content - - pattern := regexp.MustCompile("\\s+") - cont = string(pattern.ReplaceAll([]byte(cont), []byte(" "))) - cont = strings.ReplaceAll(cont, "»", "\"") - cont = strings.ReplaceAll(cont, "«", "\"") - val.Content = cont + " " + e.Text - } - }) - - // cascade - c.OnHTML("a[href]", func(e *colly.HTMLElement) { - e.Request.Visit(e.Attr("href")) - }) - - return c -} - func main() { - res := make(map[string]*model.Article) - c := spiegelCollector(&res) - - c.Visit("https://www.spiegel.de/") - - db, err := sql.Open("sqlite3", "./persistence/app.db") - if err != nil { log.Fatal(err) } - - db_articles := &sqlite.ArticleModel{ DB: db } - - counter := 0 - for _, val := range res { - counter++ - db_articles.Insert(val) + // open database + db, err := sql.Open("sqlite3", "./persistence/app.db") + if err != nil { + log.Fatal(err) } - fmt.Println(counter) + + // collect websites + _ = collectors.Collector{ + Responses: &sqlite.ResponseModel{DB: db}, + } + + //coll.Spiegel() + //coll.Zeit() + + // extract articles from websites + extr := extractors.Extractor{ + Responses: &sqlite.ResponseModel{DB: db}, + Articles: &sqlite.ArticleModel{DB: db}, + } + + extr.Spiegel() } diff --git a/go.mod b/go.mod index a939b96..ad876c0 100644 --- a/go.mod +++ b/go.mod @@ -3,8 +3,8 @@ module crowsnest go 1.23.3 require ( - github.com/PuerkitoBio/goquery v1.5.1 // indirect - github.com/andybalholm/cascadia v1.2.0 // indirect + github.com/PuerkitoBio/goquery v1.10.1 // indirect + github.com/andybalholm/cascadia v1.3.3 // indirect github.com/antchfx/htmlquery v1.2.3 // indirect github.com/antchfx/xmlquery v1.2.4 // indirect github.com/antchfx/xpath v1.1.8 // indirect @@ -16,8 +16,8 @@ require ( github.com/mattn/go-sqlite3 v1.14.24 // indirect github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect github.com/temoto/robotstxt v1.1.1 // indirect - golang.org/x/net v0.0.0-20200602114024-627f9648deb9 // indirect - golang.org/x/text v0.3.2 // indirect + golang.org/x/net v0.33.0 // indirect + golang.org/x/text v0.21.0 // indirect google.golang.org/appengine v1.6.6 // indirect google.golang.org/protobuf v1.24.0 // indirect ) diff --git a/go.sum b/go.sum index ba2cd68..024088f 100644 --- a/go.sum +++ b/go.sum @@ -2,9 +2,13 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= +github.com/PuerkitoBio/goquery v1.10.1 h1:Y8JGYUkXWTGRB6Ars3+j3kN0xg1YqqlwvdTV8WTFQcU= +github.com/PuerkitoBio/goquery v1.10.1/go.mod h1:IYiHrOMps66ag56LEH7QYDDupKXyo5A8qrjIx3ZtujY= github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M= github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4= @@ -43,6 +47,7 @@ github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5a github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg= github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= @@ -57,12 +62,23 @@ github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoH github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA= github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -70,27 +86,77 @@ golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM= golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= diff --git a/internal/model/response.go b/internal/model/response.go new file mode 100644 index 0000000..4db9b6d --- /dev/null +++ b/internal/model/response.go @@ -0,0 +1,11 @@ +package model + +import "time" + +// A simple cache for requests. +type Response struct { + Url string + Content string + Processed bool + FetchDate time.Time +} diff --git a/internal/model/sqlite/articles.go b/internal/model/sqlite/articles.go index 779b93b..12864c2 100644 --- a/internal/model/sqlite/articles.go +++ b/internal/model/sqlite/articles.go @@ -85,3 +85,25 @@ func (m *ArticleModel) Insert(a *model.Article) error { _, err = m.DB.Exec(stmt, lastId, a.Title, a.Author, a.Content) return err } + +// TODO docstring +func (m *ArticleModel) Update(a *model.Article) error { + // begin transaction + _, err := m.DB.Begin() + if err != nil { return err } + + // insert article + stmt := `UPDATE articles + SET title = ?, sourceUrl = ?, author = ?, content = ?, publishDate = ?, fetchDate = ? + WHERE id = ? + ` + _, err = m.DB.Exec(stmt, a.Title, a.SourceUrl, a.Author, a.Content, a.PublishDate, a.FetchDate, a.Identifier) + if err != nil { return err } + + // insert into fts_articles for full-text search + stmt = `INSERT INTO fts_articles (id, content) + VALUES (?, ? || '\n' || ? || '\n' || ?) + ` + _, err = m.DB.Exec(stmt, a.Identifier, a.Title, a.Author, a.Content) + return err +} diff --git a/internal/model/sqlite/responeses.go b/internal/model/sqlite/responeses.go new file mode 100644 index 0000000..66e46c0 --- /dev/null +++ b/internal/model/sqlite/responeses.go @@ -0,0 +1,91 @@ +package sqlite + +import ( + "crowsnest/internal/model" + "database/sql" +) + +type ResponseModel struct { + DB *sql.DB +} + + +// TODO docstring +func (m *ResponseModel) All() ([]model.Response, error) { + stmt := ` + SELECT url, content, fetchDate, processed + FROM responses + ` + rows, err := m.DB.Query(stmt) + if err != nil { return nil, err } + + responses := []model.Response{} + for rows.Next() { + r := model.Response{} + err := rows.Scan(&r.Url, &r.Content, &r.FetchDate, &r.Processed) + if err != nil { return nil, err } + + responses = append(responses, r) + } + + if err = rows.Err(); err != nil { return nil, err } + + return responses, nil +} + +// TODO docstring +func (m *ResponseModel) UnprocessedUrls() ([]string, error) { + stmt := ` + SELECT url + FROM responses + WHERE NOT processed + ` + rows, err := m.DB.Query(stmt) + if err != nil { return nil, err } + + urls := make([]string, 0) + for rows.Next() { + r := "" + err := rows.Scan(&r) + if err != nil { return nil, err } + + urls = append(urls, r) + } + + if err = rows.Err(); err != nil { return nil, err } + + return urls, nil +} + +// TODO docstring +func (m *ResponseModel) GetByUrl(url string) (model.Response, error) { + stmt := ` + SELECT url, content, fetchDate, processed + FROM responses + WHERE url = ? + ` + + res := model.Response{} + row := m.DB.QueryRow(stmt, url) + err := row.Scan(&res.Url, &res.Content, &res.FetchDate, &res.Processed) + + return res, err +} + +// TODO docstring +func (m *ResponseModel) Insert(url string, content string) error { + // insert response + stmt := `INSERT INTO responses (url, content) VALUES (?, ?)` + _, err := m.DB.Exec(stmt, url, content) + + return err +} + +// TODO docstring +func (m *ResponseModel) Processed(url string) error { + // insert response + stmt := `UPDATE responses SET processed = true WHERE url = ?` + _, err := m.DB.Exec(stmt, url) + + return err +}