add summarization for documents

This commit is contained in:
2025-01-20 20:34:23 +01:00
parent 0594fb8aeb
commit 637a5ebb0c
6 changed files with 270 additions and 9 deletions

View File

@@ -6,6 +6,7 @@ import (
"crowsnest/internal/middleware"
"crowsnest/internal/model"
"crowsnest/internal/model/database"
"crowsnest/internal/util"
"log"
"net/http"
@@ -18,6 +19,21 @@ func main() {
log.Fatal("failed to connect to database due to", err.Error())
}
// summarize documents
documents := &database.DocumentRepository{DB: db}
go documents.Map(func(doc *model.Document) *model.Document {
if doc.Summary == "" {
summaryText, err := util.Summarize(doc.Content)
if err == nil {
doc.Summary = summaryText
return doc
}
log.Println(err.Error())
}
return doc
})
// run web crawlers
articles := &database.ArticleRepository{DB: db}
crawler := crawler.CrawlerFacade{}

View File

@@ -1,7 +1,6 @@
package app
import (
"crowsnest/internal/model"
"html/template"
"net/http"
)
@@ -17,19 +16,13 @@ func (app *App) UpSearch(w http.ResponseWriter, req *http.Request) {
}
// get articles
articles, err := app.articles.Search(searchTerms)
articleVMs, err := app.articles.SearchArticleViewModel(searchTerms)
if err != nil {
// treat as no result
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// convert to viewmodel
articleVMs := make([]*model.ArticleViewModel, 0, len(articles))
for _, a := range articles {
articleVMs = append(articleVMs, a.ViewModel())
}
// render template
t := template.Must(template.ParseFiles(
"assets/templates/article.html",

View File

@@ -107,6 +107,57 @@ func (m *ArticleRepository) CountAll() (uint, error) {
return count, nil
}
// SearchArticleViewModel runs a full-text search over the document contents
// and returns at most 10 matching articles as view models, best ts_rank first.
//
// The query is split on whitespace and its terms are OR-ed together (joined
// with " | ") before being passed to to_tsquery with the 'german'
// configuration. For every hit an empty summary is replaced by "N/A",
// ShortSource is set to the host name of the article's source URL (left empty
// if the URL does not parse) and AiSummarized is always false.
//
// An error is returned if the query cannot be executed or a row cannot be
// scanned.
//
// NOTE(review): d.summary is scanned into a plain string, which assumes the
// column is never NULL — confirm against the schema.
func (m *ArticleRepository) SearchArticleViewModel(query string) ([]*model.ArticleViewModel, error) {
	stmt := `
SELECT a.id, a.title, a.sourceUrl, a.publishDate, d.summary
FROM articles a JOIN documents d ON a.document_id = d.id
WHERE to_tsvector('german', d.content) @@ to_tsquery('german', $1)
ORDER BY ts_rank(to_tsvector('german', d.content), to_tsquery('german', $1)) DESC
LIMIT 10
`
	// strings.Fields collapses runs of whitespace, so repeated spaces between
	// search terms cannot yield empty operands such as "a |  | b".
	query = strings.Join(strings.Fields(query), " | ")

	rows, err := m.DB.Query(stmt, query)
	if err != nil {
		return nil, err
	}
	// Release the result set on every return path, including scan errors.
	defer rows.Close()

	articleVMs := []*model.ArticleViewModel{}
	for rows.Next() {
		a := &model.ArticleViewModel{}
		var sourceUrl string
		if err := rows.Scan(&a.Id, &a.Title, &sourceUrl, &a.PublishDate, &a.Summary); err != nil {
			return nil, err
		}

		// Placeholder for documents that have no summary yet.
		if a.Summary == "" {
			a.Summary = "N/A"
		}

		// Short source: only the host part of the URL; stays empty when the
		// URL cannot be parsed.
		if parsedURL, err := url.Parse(sourceUrl); err == nil {
			a.ShortSource = parsedURL.Hostname()
		}

		// ai summary always false
		a.AiSummarized = false

		articleVMs = append(articleVMs, a)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return articleVMs, nil
}
// Will use the full-text search features of the underlying database to search
// articles for a given search query. This may fail if the connection to the
// database fails.
@@ -173,7 +224,6 @@ func (m *ArticleRepository) Insert(a *model.Article) error {
return err
}
// TODO docstring
func (m *ArticleRepository) Update(a *model.Article) error {
stmt := `UPDATE articles
SET title = $1, sourceUrl = $2, content = $4, publishDate = $5, fetchDate = $6

View File

@@ -0,0 +1,109 @@
package database
import (
"crowsnest/internal/model"
"database/sql"
)
// DocumentRepository gives access to the rows of the documents table through
// the database handle it wraps.
type DocumentRepository struct {
	// DB is the handle all queries of this repository are issued on.
	DB *sql.DB
}
// All returns one page of documents: at most limit rows, skipping the first
// offset rows, ordered by id. The explicit ordering makes consecutive pages
// well defined; without it the database is free to return rows in any order,
// so LIMIT/OFFSET paging could skip or repeat documents.
//
// An error is returned if the query fails or a row cannot be scanned.
//
// NOTE(review): summary is scanned into a plain string, which assumes the
// column is never NULL — confirm against the schema.
func (d *DocumentRepository) All(limit int, offset int) ([]*model.Document, error) {
	stmt := `
SELECT id, content, summary
FROM documents
ORDER BY id
LIMIT $1 OFFSET $2
`
	rows, err := d.DB.Query(stmt, limit, offset)
	if err != nil {
		return nil, err
	}
	// Release the result set on every return path, including scan errors.
	defer rows.Close()

	docs := []*model.Document{}
	for rows.Next() {
		// Named doc (not d) so the receiver is not shadowed.
		doc := &model.Document{}
		if err := rows.Scan(&doc.Id, &doc.Content, &doc.Summary); err != nil {
			return nil, err
		}
		docs = append(docs, doc)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return docs, nil
}
// ById returns the document with the given id. It fails if the query cannot
// be executed or if no document has that id; in the latter case the error is
// the one reported by Row.Scan (sql.ErrNoRows).
func (d *DocumentRepository) ById(id int) (*model.Document, error) {
	// The FROM clause declares no alias, so the id column is referenced
	// unqualified.
	stmt := `
SELECT id, content, summary
FROM documents
WHERE id = $1
`
	doc := &model.Document{}
	if err := d.DB.QueryRow(stmt, id).Scan(&doc.Id, &doc.Content, &doc.Summary); err != nil {
		return nil, err
	}
	return doc, nil
}
// CountAll reports the number of rows in the documents table (counted via
// count(id)). On a query or scan failure it returns 0 together with the error.
func (d *DocumentRepository) CountAll() (uint, error) {
	var total uint
	if err := d.DB.QueryRow(`SELECT count(id) FROM documents`).Scan(&total); err != nil {
		return 0, err
	}
	return total, nil
}
// Update overwrites content and summary of the documents row whose id equals
// d.Id with the values held by d. Only the error of the statement is
// reported; the result (e.g. how many rows were affected) is discarded, so an
// id that matches no row is not treated as a failure.
func (m *DocumentRepository) Update(d *model.Document) error {
	const stmt = `UPDATE documents
SET content = $1, summary = $2
WHERE id = $3
`
	if _, err := m.DB.Exec(stmt, d.Content, d.Summary, d.Id); err != nil {
		return err
	}
	return nil
}
// Map applies transform to every document and writes each result back with
// Update. Documents are fetched in batches via All; the total is determined
// once up front with CountAll.
//
// A nil result from transform means "nothing to persist": the document is
// skipped and not counted. The returned int is the number of documents that
// were written back. Processing stops at the first error, which is returned
// together with the count reached so far.
//
// NOTE(review): rows are updated while paging by offset, which presumes that
// All yields documents in a stable order across calls — verify.
func (d *DocumentRepository) Map(transform func(*model.Document) *model.Document) (int, error) {
	// Number of documents loaded per round trip.
	const batchSize = 10

	processed := 0
	count, err := d.CountAll()
	if err != nil {
		return processed, err
	}

	for offset := 0; offset < int(count); offset += batchSize {
		docs, err := d.All(batchSize, offset)
		if err != nil {
			return processed, err
		}
		for _, doc := range docs {
			newDoc := transform(doc)
			if newDoc == nil {
				// Update dereferences its argument; a nil document would panic.
				continue
			}
			if err := d.Update(newDoc); err != nil {
				return processed, err
			}
			processed++
		}
	}
	return processed, nil
}

View File

@@ -0,0 +1,7 @@
package model
// Document is the in-memory form of one row of the documents table: its id,
// the full text and the summary of that text.
type Document struct {
	// Id is the value of the row's id column.
	Id int
	// Content is the full document text.
	Content string
	// Summary is the stored summary of Content.
	Summary string
}

View File

@@ -0,0 +1,86 @@
package util
import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"time"
)
// Response is the part of a chat-completions reply that Summarize reads: the
// message content of each returned choice.
type Response struct {
	Choices []struct {
		Message struct {
			Content string `json:"content"`
		} `json:"message"`
	} `json:"choices"`
}

// Summarize asks the OpenAI chat-completions endpoint (model "gpt-4o-mini")
// for a summary of text and returns the content of the first choice. The
// instruction sent along with text is a German prompt asking for a summary of
// at most 75 words.
//
// The API key is read from the OPENAI_API_KEY environment variable. An error
// is returned if the key is not set, the request cannot be built or sent
// within the client timeout, the API answers with a non-2xx status, the reply
// is not valid JSON, or the reply contains no choice.
func Summarize(text string) (string, error) {
	const (
		apiURL = "https://api.openai.com/v1/chat/completions"
		// Upper bound for one request, so that a stalled connection cannot
		// block the caller forever.
		requestTimeout = 60 * time.Second
		// Maximum number of response-body bytes quoted in an error message.
		maxErrorBody = 512
	)

	apiKey := os.Getenv("OPENAI_API_KEY")
	if apiKey == "" {
		// Fail before any network traffic; the request could not be
		// authorized anyway.
		return "", errors.New("summarize: OPENAI_API_KEY is not set")
	}

	// Request payload
	payload := map[string]interface{}{
		"model": "gpt-4o-mini",
		"messages": []map[string]string{
			{
				"role":    "developer",
				"content": "Fasse den folgenden Zeitungsartikel in maximal 75 Wörtern zusammen. Konzentriere dich auf die wichtigsten Informationen, wie das Hauptthema, die zentralen Aussagen und relevante Hintergründe. Gib **außschließlich** die Zusammenfassung zurück.",
			},
			{
				"role":    "user",
				"content": text,
			},
		},
	}

	jsonData, err := json.Marshal(payload)
	if err != nil {
		return "", err
	}

	req, err := http.NewRequest(http.MethodPost, apiURL, bytes.NewReader(jsonData))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+apiKey)

	client := &http.Client{Timeout: requestTimeout}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}

	// Surface API failures (bad key, rate limit, ...) with their status and
	// the start of the body instead of a generic "no content" error.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		detail := body
		if len(detail) > maxErrorBody {
			detail = detail[:maxErrorBody]
		}
		return "", fmt.Errorf("summarize: unexpected status %s: %s", resp.Status, detail)
	}

	var response Response
	if err := json.Unmarshal(body, &response); err != nil {
		return "", err
	}
	if len(response.Choices) == 0 {
		return "", errors.New("could not find content in response")
	}
	return response.Choices[0].Message.Content, nil
}