add summarization for documents
This commit is contained in:
@@ -6,6 +6,7 @@ import (
|
||||
"crowsnest/internal/middleware"
|
||||
"crowsnest/internal/model"
|
||||
"crowsnest/internal/model/database"
|
||||
"crowsnest/internal/util"
|
||||
"log"
|
||||
"net/http"
|
||||
|
||||
@@ -18,6 +19,21 @@ func main() {
|
||||
log.Fatal("failed to connect to database due to", err.Error())
|
||||
}
|
||||
|
||||
// summarize documents
|
||||
documents := &database.DocumentRepository{DB: db}
|
||||
|
||||
go documents.Map(func(doc *model.Document) *model.Document {
|
||||
if doc.Summary == "" {
|
||||
summaryText, err := util.Summarize(doc.Content)
|
||||
if err == nil {
|
||||
doc.Summary = summaryText
|
||||
return doc
|
||||
}
|
||||
log.Println(err.Error())
|
||||
}
|
||||
return doc
|
||||
})
|
||||
|
||||
// run web crawlers
|
||||
articles := &database.ArticleRepository{DB: db}
|
||||
crawler := crawler.CrawlerFacade{}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"crowsnest/internal/model"
|
||||
"html/template"
|
||||
"net/http"
|
||||
)
|
||||
@@ -17,19 +16,13 @@ func (app *App) UpSearch(w http.ResponseWriter, req *http.Request) {
|
||||
}
|
||||
|
||||
// get articles
|
||||
articles, err := app.articles.Search(searchTerms)
|
||||
articleVMs, err := app.articles.SearchArticleViewModel(searchTerms)
|
||||
if err != nil {
|
||||
// treat as no result
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// convert to viewmodel
|
||||
articleVMs := make([]*model.ArticleViewModel, 0, len(articles))
|
||||
for _, a := range articles {
|
||||
articleVMs = append(articleVMs, a.ViewModel())
|
||||
}
|
||||
|
||||
// render template
|
||||
t := template.Must(template.ParseFiles(
|
||||
"assets/templates/article.html",
|
||||
|
||||
@@ -107,6 +107,57 @@ func (m *ArticleRepository) CountAll() (uint, error) {
|
||||
return count, nil
|
||||
}
|
||||
|
||||
// Will use the full-text search features of the underlying database to search
|
||||
// articles for a given search query. This may fail if the connection to the
|
||||
// database fails.
|
||||
func (m *ArticleRepository) SearchArticleViewModel(query string) ([]*model.ArticleViewModel, error) {
|
||||
stmt := `
|
||||
SELECT a.id, a.title, a.sourceUrl, a.publishDate, d.summary
|
||||
FROM articles a JOIN documents d ON a.document_id = d.id
|
||||
WHERE to_tsvector('german', d.content) @@ to_tsquery('german', $1)
|
||||
ORDER BY ts_rank(to_tsvector('german', d.content), to_tsquery('german', $1)) DESC
|
||||
LIMIT 10
|
||||
`
|
||||
|
||||
query = strings.Join(strings.Split(strings.TrimSpace(query), " "), " | ")
|
||||
rows, err := m.DB.Query(stmt, query)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
articleVMs := []*model.ArticleViewModel{}
|
||||
for rows.Next() {
|
||||
a := &model.ArticleViewModel{}
|
||||
var sourceUrl string
|
||||
err := rows.Scan(&a.Id, &a.Title, &sourceUrl, &a.PublishDate, &a.Summary)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// summary
|
||||
if a.Summary == "" {
|
||||
a.Summary = "N/A"
|
||||
}
|
||||
|
||||
// short url
|
||||
parsedURL, err := url.Parse(sourceUrl)
|
||||
if err == nil {
|
||||
a.ShortSource = parsedURL.Hostname()
|
||||
} else {
|
||||
a.ShortSource = ""
|
||||
}
|
||||
|
||||
// ai summary always false
|
||||
a.AiSummarized = false
|
||||
|
||||
articleVMs = append(articleVMs, a)
|
||||
}
|
||||
|
||||
if err = rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return articleVMs, nil
|
||||
}
|
||||
|
||||
// Will use the full-text search features of the underlying database to search
|
||||
// articles for a given search query. This may fail if the connection to the
|
||||
// database fails.
|
||||
@@ -173,7 +224,6 @@ func (m *ArticleRepository) Insert(a *model.Article) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// TODO docstring
|
||||
func (m *ArticleRepository) Update(a *model.Article) error {
|
||||
stmt := `UPDATE articles
|
||||
SET title = $1, sourceUrl = $2, content = $4, publishDate = $5, fetchDate = $6
|
||||
|
||||
109
src/internal/model/database/documentRepository.go
Normal file
109
src/internal/model/database/documentRepository.go
Normal file
@@ -0,0 +1,109 @@
|
||||
package database
|
||||
|
||||
import (
|
||||
"crowsnest/internal/model"
|
||||
"database/sql"
|
||||
)
|
||||
|
||||
// DocumentRepository bundles the database queries that operate on documents.
type DocumentRepository struct {
	// DB is the database handle all repository methods run their queries on.
	DB *sql.DB
}
|
||||
|
||||
// Gets all the documents objects from the database. This may throw an error if
|
||||
// the connection to the database fails.
|
||||
func (d *DocumentRepository) All(limit int, offset int) ([]*model.Document, error) {
|
||||
stmt := `
|
||||
SELECT id, content, summary
|
||||
FROM documents
|
||||
LIMIT $1 OFFSET $2
|
||||
`
|
||||
rows, err := d.DB.Query(stmt, limit, offset)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := []*model.Document{}
|
||||
for rows.Next() {
|
||||
d := model.Document{}
|
||||
err := rows.Scan(&d.Id, &d.Content, &d.Summary)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs = append(docs, &d)
|
||||
}
|
||||
|
||||
if err = rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// Will return an article given an id. This may fail if the connection to the
|
||||
// database fails or there is no aritcle with the given id.
|
||||
func (m *DocumentRepository) ById(id int) (*model.Document, error) {
|
||||
stmt := `
|
||||
SELECT id, content, summary
|
||||
FROM documents
|
||||
WHERE a.id = $1
|
||||
`
|
||||
|
||||
rows := m.DB.QueryRow(stmt, id)
|
||||
|
||||
d := &model.Document{}
|
||||
if err := rows.Scan(&d.Id, &d.Content, &d.Summary); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return d, nil
|
||||
}
|
||||
|
||||
// Counts all documents in the database. This may throw an error if the
|
||||
// connection to the database fails.
|
||||
func (d *DocumentRepository) CountAll() (uint, error) {
|
||||
stmt := `SELECT count(id) FROM documents`
|
||||
|
||||
rows := d.DB.QueryRow(stmt)
|
||||
|
||||
count := uint(0)
|
||||
if err := rows.Scan(&count); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func (m *DocumentRepository) Update(d *model.Document) error {
|
||||
stmt := `UPDATE documents
|
||||
SET content = $1, summary = $2
|
||||
WHERE id = $3
|
||||
`
|
||||
_, err := m.DB.Exec(stmt, d.Content, d.Summary, d.Id)
|
||||
return err
|
||||
}
|
||||
|
||||
func (d *DocumentRepository) Map(transform func(*model.Document) *model.Document) (int, error) {
|
||||
processed := 0
|
||||
|
||||
count, err := d.CountAll()
|
||||
if err != nil {
|
||||
return processed, err
|
||||
}
|
||||
|
||||
for i := 0; i < int(count); i += 10 {
|
||||
docs, err := d.All(10, i)
|
||||
if err != nil {
|
||||
return processed, err
|
||||
}
|
||||
|
||||
for _, doc := range docs {
|
||||
new_doc := transform(doc)
|
||||
err = d.Update(new_doc)
|
||||
if err != nil { return processed, err }
|
||||
processed++
|
||||
}
|
||||
}
|
||||
|
||||
return processed, nil
|
||||
}
|
||||
7
src/internal/model/document.go
Normal file
7
src/internal/model/document.go
Normal file
@@ -0,0 +1,7 @@
|
||||
package model
|
||||
|
||||
// Document holds a document's identifier, its full text and a summary of that
// text.
type Document struct {
	// Id is the identifier of the document.
	Id int
	// Content is the full text; Summary is its condensed form and may be empty.
	Content, Summary string
}
|
||||
86
src/internal/util/summary.go
Normal file
86
src/internal/util/summary.go
Normal file
@@ -0,0 +1,86 @@
|
||||
package util
|
||||
|
||||
import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"time"
)
|
||||
|
||||
// Response is the subset of a chat completion reply that is decoded from
// JSON: the list of choices, each carrying one message with its text content.
type Response struct {
	// Choices are the alternatives found under the "choices" key.
	Choices []struct {
		// Message is the object found under the "message" key of a choice.
		Message struct {
			// Content is the text found under the "content" key.
			Content string `json:"content"`
		} `json:"message"`
	} `json:"choices"`
}
|
||||
|
||||
func Summarize(text string) (string, error) {
|
||||
apiURL := "https://api.openai.com/v1/chat/completions"
|
||||
apiKey := os.Getenv("OPENAI_API_KEY")
|
||||
|
||||
// Request payload
|
||||
payload := map[string]interface{}{
|
||||
"model": "gpt-4o-mini",
|
||||
"messages": []map[string]string{
|
||||
{
|
||||
"role": "developer",
|
||||
"content": "Fasse den folgenden Zeitungsartikel in maximal 75 Wörtern zusammen. Konzentriere dich auf die wichtigsten Informationen, wie das Hauptthema, die zentralen Aussagen und relevante Hintergründe. Gib **außschließlich** die Zusammenfassung zurück.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": text,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// Convert payload to JSON
|
||||
jsonData, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Create an HTTP request
|
||||
req, err := http.NewRequest("POST", apiURL, bytes.NewBuffer(jsonData))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Add headers
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", apiKey))
|
||||
|
||||
// Send the request
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Read the response
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Unmarshal the JSON response
|
||||
var response Response
|
||||
err = json.Unmarshal(body, &response)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Extract and print the content
|
||||
var content string
|
||||
if len(response.Choices) > 0 {
|
||||
content = response.Choices[0].Message.Content
|
||||
} else {
|
||||
return "", errors.New("could not find content in response")
|
||||
}
|
||||
|
||||
return content, nil
|
||||
}
|
||||
Reference in New Issue
Block a user