From 637a5ebb0ccbde0ce5cf0e26be7d6c8396019b11 Mon Sep 17 00:00:00 2001 From: Elias Kohout Date: Mon, 20 Jan 2025 20:34:23 +0100 Subject: [PATCH] add summarization for documents --- src/cmd/frontend/main.go | 16 +++ src/internal/app/UpSearch.go | 9 +- .../model/database/articleRepository.go | 52 ++++++++- .../model/database/documentRepository.go | 109 ++++++++++++++++++ src/internal/model/document.go | 7 ++ src/internal/util/summary.go | 86 ++++++++++++++ 6 files changed, 270 insertions(+), 9 deletions(-) create mode 100644 src/internal/model/database/documentRepository.go create mode 100644 src/internal/model/document.go create mode 100644 src/internal/util/summary.go diff --git a/src/cmd/frontend/main.go b/src/cmd/frontend/main.go index cca795f..8fc7f25 100644 --- a/src/cmd/frontend/main.go +++ b/src/cmd/frontend/main.go @@ -6,6 +6,7 @@ import ( "crowsnest/internal/middleware" "crowsnest/internal/model" "crowsnest/internal/model/database" + "crowsnest/internal/util" "log" "net/http" @@ -18,6 +19,21 @@ func main() { log.Fatal("failed to connect to database due to", err.Error()) } + // summarize documents + documents := &database.DocumentRepository{DB: db} + + go documents.Map(func(doc *model.Document) *model.Document { + if doc.Summary == "" { + summaryText, err := util.Summarize(doc.Content) + if err == nil { + doc.Summary = summaryText + return doc + } + log.Println(err.Error()) + } + return doc + }) + // run web crawlers articles := &database.ArticleRepository{DB: db} crawler := crawler.CrawlerFacade{} diff --git a/src/internal/app/UpSearch.go b/src/internal/app/UpSearch.go index f72cc4c..d6a21ed 100644 --- a/src/internal/app/UpSearch.go +++ b/src/internal/app/UpSearch.go @@ -1,7 +1,6 @@ package app import ( - "crowsnest/internal/model" "html/template" "net/http" ) @@ -17,19 +16,13 @@ func (app *App) UpSearch(w http.ResponseWriter, req *http.Request) { } // get articles - articles, err := app.articles.Search(searchTerms) + articleVMs, err := 
app.articles.SearchArticleViewModel(searchTerms) if err != nil { // treat as no result http.Error(w, err.Error(), http.StatusInternalServerError) return } - // convert to viewmodel - articleVMs := make([]*model.ArticleViewModel, 0, len(articles)) - for _, a := range articles { - articleVMs = append(articleVMs, a.ViewModel()) - } - // render template t := template.Must(template.ParseFiles( "assets/templates/article.html", diff --git a/src/internal/model/database/articleRepository.go b/src/internal/model/database/articleRepository.go index 920b3ed..c2ee7a2 100644 --- a/src/internal/model/database/articleRepository.go +++ b/src/internal/model/database/articleRepository.go @@ -107,6 +107,57 @@ func (m *ArticleRepository) CountAll() (uint, error) { return count, nil } +// SearchArticleViewModel runs a German full-text search over document content, +// OR-ing the whitespace-separated terms of query, and returns at most ten view models ordered by rank. +// Empty summaries are replaced with "N/A"; query, scan and iteration errors are returned. +func (m *ArticleRepository) SearchArticleViewModel(query string) ([]*model.ArticleViewModel, error) { + stmt := ` + SELECT a.id, a.title, a.sourceUrl, a.publishDate, d.summary + FROM articles a JOIN documents d ON a.document_id = d.id + WHERE to_tsvector('german', d.content) @@ to_tsquery('german', $1) + ORDER BY ts_rank(to_tsvector('german', d.content), to_tsquery('german', $1)) DESC + LIMIT 10 + ` + + query = strings.Join(strings.Fields(query), " | ") + rows, err := m.DB.Query(stmt, query) + if err != nil { + return nil, err + } + + articleVMs := []*model.ArticleViewModel{} + for rows.Next() { + a := &model.ArticleViewModel{} + var sourceUrl string + err := rows.Scan(&a.Id, &a.Title, &sourceUrl, &a.PublishDate, &a.Summary) + if err != nil { + return nil, err + } + // placeholder for an empty summary + if a.Summary == "" { + a.Summary = "N/A" + } + + // short source: host name of the source URL, empty if it does not parse + parsedURL, err := url.Parse(sourceUrl) + if err == nil { + a.ShortSource = parsedURL.Hostname() + } else { + a.ShortSource = "" + } + + // ai summary always false + 
a.AiSummarized = false + + articleVMs = append(articleVMs, a) + } + + if err = rows.Err(); err != nil { + return nil, err + } + return articleVMs, nil +} + // Will use the full-text search features of the underlying database to search // articles for a given search query. This may fail if the connection to the // database fails. @@ -173,7 +224,6 @@ func (m *ArticleRepository) Insert(a *model.Article) error { return err } -// TODO docstring func (m *ArticleRepository) Update(a *model.Article) error { stmt := `UPDATE articles SET title = $1, sourceUrl = $2, content = $4, publishDate = $5, fetchDate = $6 diff --git a/src/internal/model/database/documentRepository.go b/src/internal/model/database/documentRepository.go new file mode 100644 index 0000000..944148d --- /dev/null +++ b/src/internal/model/database/documentRepository.go @@ -0,0 +1,109 @@ +package database + +import ( + "crowsnest/internal/model" + "database/sql" +) + +type DocumentRepository struct { + DB *sql.DB +} + +// All returns one page of documents: at most limit rows after skipping offset rows, ordered by id so that +// successive pages are stable. It returns an error if the query, a row scan, or row iteration fails. +func (d *DocumentRepository) All(limit int, offset int) ([]*model.Document, error) { + stmt := ` + SELECT id, content, summary + FROM documents ORDER BY id + LIMIT $1 OFFSET $2 + ` + rows, err := d.DB.Query(stmt, limit, offset) + if err != nil { + return nil, err + } + + docs := []*model.Document{} + for rows.Next() { + d := model.Document{} + err := rows.Scan(&d.Id, &d.Content, &d.Summary) + if err != nil { + return nil, err + } + + docs = append(docs, &d) + } + + if err = rows.Err(); err != nil { + return nil, err + } + + return docs, nil +} + +// ById returns the document with the given id. It returns an error if the +// query fails or no row matches the id. 
+func (m *DocumentRepository) ById(id int) (*model.Document, error) { + stmt := ` + SELECT id, content, summary + FROM documents + WHERE id = $1 + ` + + rows := m.DB.QueryRow(stmt, id) + + d := &model.Document{} + if err := rows.Scan(&d.Id, &d.Content, &d.Summary); err != nil { + return nil, err + } + + return d, nil +} + +// CountAll returns the number of rows in the documents table. It returns an error if the +// query or the scan of its result fails. +func (d *DocumentRepository) CountAll() (uint, error) { + stmt := `SELECT count(id) FROM documents` + + rows := d.DB.QueryRow(stmt) + + count := uint(0) + if err := rows.Scan(&count); err != nil { + return 0, err + } + + return count, nil +} + +func (m *DocumentRepository) Update(d *model.Document) error { + stmt := `UPDATE documents + SET content = $1, summary = $2 + WHERE id = $3 + ` + _, err := m.DB.Exec(stmt, d.Content, d.Summary, d.Id) + return err +} + +func (d *DocumentRepository) Map(transform func(*model.Document) *model.Document) (int, error) { + processed := 0 + + count, err := d.CountAll() + if err != nil { + return processed, err + } + + for i := 0; i < int(count); i += 10 { + docs, err := d.All(10, i) + if err != nil { + return processed, err + } + + for _, doc := range docs { + new_doc := transform(doc) + err = d.Update(new_doc) + if err != nil { return processed, err } + processed++ + } + } + + return processed, nil +} diff --git a/src/internal/model/document.go b/src/internal/model/document.go new file mode 100644 index 0000000..9f9f935 --- /dev/null +++ b/src/internal/model/document.go @@ -0,0 +1,7 @@ +package model + +type Document struct { + Id int + Content string + Summary string +} diff --git a/src/internal/util/summary.go b/src/internal/util/summary.go new file mode 100644 index 0000000..a6220f8 --- /dev/null +++ b/src/internal/util/summary.go @@ -0,0 +1,86 @@ +package util + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" + "io/ioutil" + "net/http" + "os" +) + +type Response struct { + Choices []struct 
{ + Message struct { + Content string `json:"content"` + } `json:"message"` + } `json:"choices"` +} + +func Summarize(text string) (string, error) { + apiURL := "https://api.openai.com/v1/chat/completions" + apiKey := os.Getenv("OPENAI_API_KEY") + + // Request payload + payload := map[string]interface{}{ + "model": "gpt-4o-mini", + "messages": []map[string]string{ + { + "role": "developer", + "content": "Fasse den folgenden Zeitungsartikel in maximal 75 Wörtern zusammen. Konzentriere dich auf die wichtigsten Informationen, wie das Hauptthema, die zentralen Aussagen und relevante Hintergründe. Gib **außschließlich** die Zusammenfassung zurück.", + }, + { + "role": "user", + "content": text, + }, + }, + } + + // Convert payload to JSON + jsonData, err := json.Marshal(payload) + if err != nil { + return "", err + } + + // Create an HTTP request + req, err := http.NewRequest("POST", apiURL, bytes.NewBuffer(jsonData)) + if err != nil { + return "", err + } + + // Add headers + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", apiKey)) + + // Send the request + client := &http.Client{} + resp, err := client.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + // Read the response + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return "", err + } + + // Unmarshal the JSON response + var response Response + err = json.Unmarshal(body, &response) + if err != nil { + return "", err + } + + // Extract the content of the first choice + var content string + if len(response.Choices) > 0 { + content = response.Choices[0].Message.Content + } else { + return "", errors.New("could not find content in response") + } + + return content, nil +}