Compare commits

...

3 Commits

Author SHA1 Message Date
fd25d7ebbc Merge branch 'main' of git.kohout-dev.de:crowsnest/crowsnest 2025-01-20 20:44:29 +01:00
32a0712124 adding .env file 2025-01-20 20:40:42 +01:00
637a5ebb0c add summarization for documents 2025-01-20 20:34:23 +01:00
8 changed files with 279 additions and 15 deletions

3
.gitignore vendored
View File

@@ -23,3 +23,6 @@ go.work
persistence persistence
.DS_Store .DS_Store
# env
.env

View File

@@ -1,11 +1,11 @@
DB_HOST="10.99.0.3" # Load the .env file
DB_PORT="5432" ifneq (,$(wildcard ./.env))
DB_NAME="crowsnest_dev" include .env
DB_USER="crow_dev" export $(shell sed 's/=.*//' .env)
DB_PASS="hL0VlXkH2WoHL7c7FdRTHXMy" endif
go-run: go-run:
cd src; DB_USER=$(DB_USER) DB_PASS=$(DB_PASS) DB_NAME=$(DB_NAME) DB_HOST=$(DB_HOST) go run cmd/frontend/main.go cd src; go run cmd/frontend/main.go;
migrate-up: migrate-up:
goose -dir=./src/assets/migrations/ postgres "postgresql://$(DB_USER):$(DB_PASS)@$(DB_HOST):$(DB_PORT)/$(DB_NAME)" up goose -dir=./src/assets/migrations/ postgres "postgresql://$(DB_USER):$(DB_PASS)@$(DB_HOST):$(DB_PORT)/$(DB_NAME)" up

View File

@@ -6,6 +6,7 @@ import (
"crowsnest/internal/middleware" "crowsnest/internal/middleware"
"crowsnest/internal/model" "crowsnest/internal/model"
"crowsnest/internal/model/database" "crowsnest/internal/model/database"
"crowsnest/internal/util"
"log" "log"
"net/http" "net/http"
@@ -18,6 +19,21 @@ func main() {
log.Fatal("failed to connect to database due to", err.Error()) log.Fatal("failed to connect to database due to", err.Error())
} }
// summarize documents
documents := &database.DocumentRepository{DB: db}
go documents.Map(func(doc *model.Document) *model.Document {
if doc.Summary == "" {
summaryText, err := util.Summarize(doc.Content)
if err == nil {
doc.Summary = summaryText
return doc
}
log.Println(err.Error())
}
return doc
})
// run web crawlers // run web crawlers
articles := &database.ArticleRepository{DB: db} articles := &database.ArticleRepository{DB: db}
crawler := crawler.CrawlerFacade{} crawler := crawler.CrawlerFacade{}

View File

@@ -1,7 +1,6 @@
package app package app
import ( import (
"crowsnest/internal/model"
"html/template" "html/template"
"net/http" "net/http"
) )
@@ -17,19 +16,13 @@ func (app *App) UpSearch(w http.ResponseWriter, req *http.Request) {
} }
// get articles // get articles
articles, err := app.articles.Search(searchTerms) articleVMs, err := app.articles.SearchArticleViewModel(searchTerms)
if err != nil { if err != nil {
// treat as no result // treat as no result
http.Error(w, err.Error(), http.StatusInternalServerError) http.Error(w, err.Error(), http.StatusInternalServerError)
return return
} }
// convert to viewmodel
articleVMs := make([]*model.ArticleViewModel, 0, len(articles))
for _, a := range articles {
articleVMs = append(articleVMs, a.ViewModel())
}
// render template // render template
t := template.Must(template.ParseFiles( t := template.Must(template.ParseFiles(
"assets/templates/article.html", "assets/templates/article.html",

View File

@@ -107,6 +107,57 @@ func (m *ArticleRepository) CountAll() (uint, error) {
return count, nil return count, nil
} }
// SearchArticleViewModel runs a German full-text search over the document
// content joined to each article and returns at most 10 matches as view
// models, ordered by descending ts_rank.
//
// The query is split on whitespace and the resulting terms are OR-ed together
// before being handed to to_tsquery. An empty summary is replaced by "N/A",
// ShortSource is the host name of the article's source URL (left empty when
// the URL cannot be parsed) and AiSummarized is left at its zero value.
//
// An error is returned if the query fails or a row cannot be scanned.
func (m *ArticleRepository) SearchArticleViewModel(query string) ([]*model.ArticleViewModel, error) {
	stmt := `
SELECT a.id, a.title, a.sourceUrl, a.publishDate, d.summary
FROM articles a JOIN documents d ON a.document_id = d.id
WHERE to_tsvector('german', d.content) @@ to_tsquery('german', $1)
ORDER BY ts_rank(to_tsvector('german', d.content), to_tsquery('german', $1)) DESC
LIMIT 10
`

	// strings.Fields drops the empty tokens that repeated spaces would
	// otherwise produce; those would turn into "a |  | b", which to_tsquery
	// rejects as a syntax error.
	query = strings.Join(strings.Fields(query), " | ")

	rows, err := m.DB.Query(stmt, query)
	if err != nil {
		return nil, err
	}
	// Release the underlying connection even when returning early below.
	defer rows.Close()

	articleVMs := []*model.ArticleViewModel{}
	for rows.Next() {
		a := &model.ArticleViewModel{}
		var sourceURL string
		if err := rows.Scan(&a.Id, &a.Title, &sourceURL, &a.PublishDate, &a.Summary); err != nil {
			return nil, err
		}

		// Placeholder for documents that have not been summarized yet.
		if a.Summary == "" {
			a.Summary = "N/A"
		}

		// Only the host name is shown; an unparsable URL leaves it empty.
		if parsedURL, err := url.Parse(sourceURL); err == nil {
			a.ShortSource = parsedURL.Hostname()
		}

		articleVMs = append(articleVMs, a)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return articleVMs, nil
}
// Will use the full-text search features of the underlying database to search // Will use the full-text search features of the underlying database to search
// articles for a given search query. This may fail if the connection to the // articles for a given search query. This may fail if the connection to the
// database fails. // database fails.
@@ -173,7 +224,6 @@ func (m *ArticleRepository) Insert(a *model.Article) error {
return err return err
} }
// TODO docstring
func (m *ArticleRepository) Update(a *model.Article) error { func (m *ArticleRepository) Update(a *model.Article) error {
stmt := `UPDATE articles stmt := `UPDATE articles
SET title = $1, sourceUrl = $2, content = $4, publishDate = $5, fetchDate = $6 SET title = $1, sourceUrl = $2, content = $4, publishDate = $5, fetchDate = $6

View File

@@ -0,0 +1,109 @@
package database
import (
"crowsnest/internal/model"
"database/sql"
)
// DocumentRepository groups the queries that read and write rows of the
// documents table.
type DocumentRepository struct {
// DB is the database handle every method of the repository runs its
// statements on.
DB *sql.DB
}
// All returns one page of documents: at most limit rows, skipping the first
// offset rows, ordered by id. An error is returned if the query fails or a row
// cannot be scanned.
func (d *DocumentRepository) All(limit int, offset int) ([]*model.Document, error) {
	// ORDER BY makes LIMIT/OFFSET paging deterministic. Without it the
	// database is free to return rows in a different order on each call, so
	// consecutive pages could skip or repeat documents.
	stmt := `
SELECT id, content, summary
FROM documents
ORDER BY id
LIMIT $1 OFFSET $2
`

	rows, err := d.DB.Query(stmt, limit, offset)
	if err != nil {
		return nil, err
	}
	// Release the underlying connection even when returning early below.
	defer rows.Close()

	docs := []*model.Document{}
	for rows.Next() {
		doc := &model.Document{}
		if err := rows.Scan(&doc.Id, &doc.Content, &doc.Summary); err != nil {
			return nil, err
		}
		docs = append(docs, doc)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	return docs, nil
}
// ById returns the document with the given id. An error is returned if the
// query fails or no document with that id exists (Scan then reports
// sql.ErrNoRows).
func (m *DocumentRepository) ById(id int) (*model.Document, error) {
	// The statement declares no table alias, so the column is referenced as
	// plain "id".
	stmt := `
SELECT id, content, summary
FROM documents
WHERE id = $1
`

	row := m.DB.QueryRow(stmt, id)

	doc := &model.Document{}
	if err := row.Scan(&doc.Id, &doc.Content, &doc.Summary); err != nil {
		return nil, err
	}

	return doc, nil
}
// CountAll reports how many rows the documents table holds. The scan error is
// passed through unchanged, together with a zero count, when the value cannot
// be read.
func (d *DocumentRepository) CountAll() (uint, error) {
	var total uint

	err := d.DB.QueryRow(`SELECT count(id) FROM documents`).Scan(&total)
	if err != nil {
		return 0, err
	}

	return total, nil
}
// Update overwrites the content and summary columns of the row whose id equals
// d.Id with the values held in d. The error from executing the statement, if
// any, is returned as is.
func (m *DocumentRepository) Update(d *model.Document) error {
	const stmt = `UPDATE documents
SET content = $1, summary = $2
WHERE id = $3
`

	if _, err := m.DB.Exec(stmt, d.Content, d.Summary, d.Id); err != nil {
		return err
	}
	return nil
}
// Map applies transform to every document and writes each result back with
// Update. Documents are loaded page by page via All, using the row count
// reported by CountAll as the upper bound for the offset.
//
// When transform returns nil the document is left untouched and not counted.
// Map returns the number of documents written back; on the first failing
// query or update it stops and returns that error together with the count
// reached so far.
func (d *DocumentRepository) Map(transform func(*model.Document) *model.Document) (int, error) {
	// Number of documents loaded per call to All.
	const pageSize = 10

	processed := 0

	count, err := d.CountAll()
	if err != nil {
		return processed, err
	}

	for offset := 0; offset < int(count); offset += pageSize {
		docs, err := d.All(pageSize, offset)
		if err != nil {
			return processed, err
		}

		for _, doc := range docs {
			updated := transform(doc)
			if updated == nil {
				// Nothing to persist; Update would dereference the nil
				// pointer and panic.
				continue
			}
			if err := d.Update(updated); err != nil {
				return processed, err
			}
			processed++
		}
	}

	return processed, nil
}

View File

@@ -0,0 +1,7 @@
package model
// Document is the in-memory form of one row of the documents table.
type Document struct {
// Id is the value of the row's id column.
Id int
// Content is the full text stored for the document.
Content string
// Summary is the stored summary of Content; callers in this project treat
// the empty string as "no summary yet".
Summary string
}

View File

@@ -0,0 +1,86 @@
package util
import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"time"
)
// Response mirrors the part of a chat completions response body that
// Summarize reads: the message content of each returned choice.
type Response struct {
	Choices []struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
	} `json:"choices"`
}

// Summarize sends text to the OpenAI chat completions endpoint together with a
// German instruction to summarize a newspaper article in at most 75 words, and
// returns the message content of the first choice in the response.
//
// The API key is read from the OPENAI_API_KEY environment variable.
//
// An error is returned when the key is not set, the request cannot be built or
// sent (including a timeout), the API answers with a non-2xx status, the body
// is not valid JSON, or the response contains no choices.
func Summarize(text string) (string, error) {
	const (
		apiURL = "https://api.openai.com/v1/chat/completions"
		// Upper bound for the whole exchange; without it a stalled
		// connection would block the caller indefinitely.
		requestTimeout = 60 * time.Second
	)

	// Fail fast instead of sending a request that cannot be authenticated.
	apiKey := os.Getenv("OPENAI_API_KEY")
	if apiKey == "" {
		return "", errors.New("OPENAI_API_KEY is not set")
	}

	// Request payload
	payload := map[string]interface{}{
		"model": "gpt-4o-mini",
		"messages": []map[string]string{
			{
				"role":    "developer",
				"content": "Fasse den folgenden Zeitungsartikel in maximal 75 Wörtern zusammen. Konzentriere dich auf die wichtigsten Informationen, wie das Hauptthema, die zentralen Aussagen und relevante Hintergründe. Gib **außschließlich** die Zusammenfassung zurück.",
			},
			{
				"role":    "user",
				"content": text,
			},
		},
	}

	jsonData, err := json.Marshal(payload)
	if err != nil {
		return "", err
	}

	req, err := http.NewRequest(http.MethodPost, apiURL, bytes.NewBuffer(jsonData))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", apiKey))

	client := &http.Client{Timeout: requestTimeout}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}

	// An error response carries no choices; report the status and body
	// instead of the generic "could not find content" error below.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("summarize request returned status %s: %s", resp.Status, body)
	}

	var response Response
	if err := json.Unmarshal(body, &response); err != nil {
		return "", err
	}

	if len(response.Choices) == 0 {
		return "", errors.New("could not find content in response")
	}
	return response.Choices[0].Message.Content, nil
}