Initial support for auto-tagging

This commit is contained in:
Dominik Schröter 2024-09-23 17:03:14 +02:00
parent 47275e277a
commit 26a160209f
2 changed files with 154 additions and 8 deletions

120
main.go
View file

@ -57,6 +57,7 @@ type Document struct {
Content string `json:"content"` Content string `json:"content"`
Tags []int `json:"tags"` Tags []int `json:"tags"`
SuggestedTitle string `json:"suggested_title,omitempty"` SuggestedTitle string `json:"suggested_title,omitempty"`
SuggestedTags []string `json:"suggested_tags,omitempty"`
} }
var ( var (
@ -130,6 +131,46 @@ func createLLM() (llms.Model, error) {
} }
} }
// getAllTags fetches every tag known to paperless-ngx and returns a mapping
// from tag name to tag ID.
//
// The tags endpoint answers with a paginated envelope: each page carries its
// tags in "results" and a "next" URL pointing at the following page (null on
// the last one). All pages are followed, so tags beyond the first page are
// not silently dropped.
//
// Every request is bound to ctx and authenticated with apiToken through the
// "Authorization: Token ..." header. A non-200 response on any page aborts
// the whole fetch and returns an error containing the status code and body.
func getAllTags(ctx context.Context, baseURL, apiToken string) (map[string]int, error) {
	type tagsPage struct {
		Next    string `json:"next"`
		Results []struct {
			ID   int    `json:"id"`
			Name string `json:"name"`
		} `json:"results"`
	}

	client := &http.Client{}

	// fetchPage retrieves and decodes one page. It is a separate function so
	// the deferred Close runs before the next page is requested instead of
	// piling up until getAllTags returns.
	fetchPage := func(pageURL string) (*tagsPage, error) {
		req, err := http.NewRequestWithContext(ctx, "GET", pageURL, nil)
		if err != nil {
			return nil, err
		}
		req.Header.Set("Authorization", fmt.Sprintf("Token %s", apiToken))

		resp, err := client.Do(req)
		if err != nil {
			return nil, err
		}
		defer resp.Body.Close()

		if resp.StatusCode != http.StatusOK {
			// Best effort: the body is only used to enrich the error text.
			bodyBytes, _ := io.ReadAll(resp.Body)
			return nil, fmt.Errorf("Error fetching tags: %d, %s", resp.StatusCode, string(bodyBytes))
		}

		var page tagsPage
		if err := json.NewDecoder(resp.Body).Decode(&page); err != nil {
			return nil, err
		}
		return &page, nil
	}

	tagIDMapping := make(map[string]int)
	visited := make(map[string]struct{})

	pageURL := fmt.Sprintf("%s/api/tags/", baseURL)
	for pageURL != "" {
		// A server that keeps handing back an already-seen "next" link would
		// otherwise make this loop spin forever.
		if _, seen := visited[pageURL]; seen {
			return nil, fmt.Errorf("Error fetching tags: pagination loop detected at %s", pageURL)
		}
		visited[pageURL] = struct{}{}

		page, err := fetchPage(pageURL)
		if err != nil {
			return nil, err
		}
		for _, tag := range page.Results {
			tagIDMapping[tag.Name] = tag.ID
		}

		// A JSON null "next" decodes to the empty string and ends the loop.
		pageURL = page.Next
	}

	return tagIDMapping, nil
}
// documentsHandler returns documents with the specific tag // documentsHandler returns documents with the specific tag
func documentsHandler(c *gin.Context) { func documentsHandler(c *gin.Context) {
ctx := c.Request.Context() ctx := c.Request.Context()
@ -290,6 +331,18 @@ func processDocuments(ctx context.Context, documents []Document) ([]Document, er
return nil, fmt.Errorf("failed to create LLM client: %v", err) return nil, fmt.Errorf("failed to create LLM client: %v", err)
} }
// Fetch all available tags from paperless-ngx
availableTags, err := getAllTags(ctx, paperlessBaseURL, paperlessAPIToken)
if err != nil {
return nil, fmt.Errorf("failed to fetch available tags: %v", err)
}
// Prepare a list of tag names
availableTagNames := make([]string, 0, len(availableTags))
for tagName := range availableTags {
availableTagNames = append(availableTagNames, tagName)
}
var wg sync.WaitGroup var wg sync.WaitGroup
var mu sync.Mutex var mu sync.Mutex
errors := make([]error, 0) errors := make([]error, 0)
@ -315,8 +368,18 @@ func processDocuments(ctx context.Context, documents []Document) ([]Document, er
return return
} }
suggestedTags, err := getSuggestedTags(ctx, llm, content, suggestedTitle, availableTagNames)
if err != nil {
mu.Lock()
errors = append(errors, fmt.Errorf("Document %d: %v", documentID, err))
mu.Unlock()
log.Printf("Error generating tags for document %d: %v", documentID, err)
return
}
mu.Lock() mu.Lock()
doc.SuggestedTitle = suggestedTitle doc.SuggestedTitle = suggestedTitle
doc.SuggestedTags = suggestedTags
mu.Unlock() mu.Unlock()
log.Printf("Document %d processed successfully.", documentID) log.Printf("Document %d processed successfully.", documentID)
}(&documents[i]) }(&documents[i])
@ -331,6 +394,47 @@ func processDocuments(ctx context.Context, documents []Document) ([]Document, er
return documents, nil return documents, nil
} }
// getSuggestedTags asks the LLM to choose tags for a document from
// availableTags, given the document content and its suggested title.
//
// The prompt tells the model to answer with a comma-separated list. The
// answer is split on commas, each entry is trimmed, and blank entries are
// dropped, so an empty answer yields an empty slice rather than a slice
// holding one empty string. The returned names are NOT checked against
// availableTags here.
//
// The language hint in the prompt comes from the LLM_LANGUAGE environment
// variable and falls back to "English" when it is unset or empty.
//
// An error is returned when the LLM call fails or returns no choices.
func getSuggestedTags(ctx context.Context, llm llms.Model, content string, suggestedTitle string, availableTags []string) ([]string, error) {
	likelyLanguage := os.Getenv("LLM_LANGUAGE")
	if likelyLanguage == "" {
		likelyLanguage = "English"
	}

	prompt := fmt.Sprintf(`I will provide you with the content and suggested title of a document. Your task is to select appropriate tags for the document from the list of available tags I will provide. Only select tags from the provided list. Respond only with the selected tags as a comma-separated list, without any additional information. The content is likely in %s.
Available Tags:
%s
Suggested Title:
%s
Content:
%s
`, likelyLanguage, strings.Join(availableTags, ", "), suggestedTitle, content)

	completion, err := llm.GenerateContent(ctx, []llms.MessageContent{
		{
			Parts: []llms.ContentPart{
				llms.TextContent{
					Text: prompt,
				},
			},
			Role: llms.ChatMessageTypeHuman,
		},
	})
	if err != nil {
		return nil, fmt.Errorf("Error getting response from LLM: %w", err)
	}
	// Guard the index below: a response without choices would otherwise panic.
	if completion == nil || len(completion.Choices) == 0 {
		return nil, fmt.Errorf("Error getting response from LLM: no choices returned")
	}

	response := strings.TrimSpace(completion.Choices[0].Content)

	parts := strings.Split(response, ",")
	suggestedTags := make([]string, 0, len(parts))
	for _, part := range parts {
		// Skip blanks produced by an empty answer or stray/trailing commas.
		if tag := strings.TrimSpace(part); tag != "" {
			suggestedTags = append(suggestedTags, tag)
		}
	}

	return suggestedTags, nil
}
func getSuggestedTitle(ctx context.Context, llm llms.Model, content string) (string, error) { func getSuggestedTitle(ctx context.Context, llm llms.Model, content string) (string, error) {
likelyLanguage, ok := os.LookupEnv("LLM_LANGUAGE") likelyLanguage, ok := os.LookupEnv("LLM_LANGUAGE")
if !ok { if !ok {
@ -366,6 +470,13 @@ Content:
func updateDocuments(ctx context.Context, baseURL, apiToken string, documents []Document, paperlessGptTagID int) error { func updateDocuments(ctx context.Context, baseURL, apiToken string, documents []Document, paperlessGptTagID int) error {
client := &http.Client{} client := &http.Client{}
// Fetch all available tags
availableTags, err := getAllTags(ctx, baseURL, apiToken)
if err != nil {
log.Printf("Error fetching available tags: %v", err)
return err
}
for _, document := range documents { for _, document := range documents {
documentID := document.ID documentID := document.ID
@ -378,6 +489,15 @@ func updateDocuments(ctx context.Context, baseURL, apiToken string, documents []
} }
} }
// Map suggested tag names to IDs
for _, tagName := range document.SuggestedTags {
if tagID, exists := availableTags[tagName]; exists {
newTags = append(newTags, tagID)
} else {
log.Printf("Tag '%s' does not exist in paperless-ngx, skipping.", tagName)
}
}
updatedFields["tags"] = newTags updatedFields["tags"] = newTags
suggestedTitle := document.SuggestedTitle suggestedTitle := document.SuggestedTitle

View file

@ -13,6 +13,7 @@ interface Document {
title: string; title: string;
content: string; content: string;
suggested_title?: string; suggested_title?: string;
suggested_tags?: string[];
} }
const DocumentProcessor: React.FC = () => { const DocumentProcessor: React.FC = () => {
@ -30,7 +31,7 @@ const DocumentProcessor: React.FC = () => {
} catch (error) { } catch (error) {
console.error("Error fetching filter tag:", error); console.error("Error fetching filter tag:", error);
} }
} };
const fetchDocuments = async () => { const fetchDocuments = async () => {
try { try {
@ -109,8 +110,10 @@ const DocumentProcessor: React.FC = () => {
<div className="flex items-center justify-center h-screen"> <div className="flex items-center justify-center h-screen">
<div className="text-xl font-semibold"> <div className="text-xl font-semibold">
No documents found with filter tag{" "} No documents found with filter tag{" "}
<span className="bg-blue-100 text-blue-800 text-sm font-medium me-2 px-2.5 py-0.5 rounded dark:bg-blue-900 dark:text-blue-300bg-blue-100 text-blue-800 text-xs font-medium me-2 px-2.5 py-0.5 rounded-full dark:bg-blue-900 dark:text-blue-300">{filterTag}</span> <span className="bg-blue-100 text-blue-800 text-sm font-medium me-2 px-2.5 py-0.5 rounded dark:bg-blue-900 dark:text-blue-300bg-blue-100 text-blue-800 text-xs font-medium me-2 px-2.5 py-0.5 rounded-full dark:bg-blue-900 dark:text-blue-300">
{" "}found. Try{" "} {filterTag}
</span>{" "}
found. Try{" "}
<button <button
onClick={() => { onClick={() => {
setDocuments([]); setDocuments([]);
@ -199,6 +202,9 @@ const DocumentProcessor: React.FC = () => {
<th className="px-4 py-2 text-left text-sm font-medium text-gray-500"> <th className="px-4 py-2 text-left text-sm font-medium text-gray-500">
Suggested Title Suggested Title
</th> </th>
<th className="px-4 py-2 text-left text-sm font-medium text-gray-500">
Suggested Tags
</th>
</tr> </tr>
</thead> </thead>
<tbody className="bg-white divide-y divide-gray-200"> <tbody className="bg-white divide-y divide-gray-200">
@ -227,6 +233,26 @@ const DocumentProcessor: React.FC = () => {
className="w-full border border-gray-300 rounded px-2 py-1 focus:outline-none focus:ring-2 focus:ring-blue-500" className="w-full border border-gray-300 rounded px-2 py-1 focus:outline-none focus:ring-2 focus:ring-blue-500"
/> />
</td> </td>
<td className="px-4 py-3 text-sm text-gray-900">
<input
type="text"
value={doc.suggested_tags?.join(", ")}
onChange={(e) => {
const updatedDocuments = documents.map((d) =>
d.id === doc.id
? {
...d,
suggested_tags: e.target.value
.split(",")
.map((tag) => tag.trim()),
}
: d
);
setDocuments(updatedDocuments);
}}
className="w-full border border-gray-300 rounded px-2 py-1 focus:outline-none focus:ring-2 focus:ring-blue-500"
/>
</td>
</tr> </tr>
) )
)} )}