feat: add support for automatic OCR (#75)

Icereed 2025-01-06 23:03:41 +01:00 committed by GitHub
parent 99ad4883e8
commit 32f83ec93f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 136 additions and 37 deletions


@@ -71,6 +71,8 @@ services:
PAPERLESS_BASE_URL: 'http://paperless-ngx:8000'
PAPERLESS_API_TOKEN: 'your_paperless_api_token'
PAPERLESS_PUBLIC_URL: 'http://paperless.mydomain.com' # Optional, your public link to access Paperless
MANUAL_TAG: 'paperless-gpt' # Optional, default is 'paperless-gpt'
AUTO_TAG: 'paperless-gpt-auto' # Optional, default is 'paperless-gpt-auto'
LLM_PROVIDER: 'openai' # or 'ollama'
LLM_MODEL: 'gpt-4o' # or 'llama2'
OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
@@ -78,6 +80,7 @@ services:
OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
VISION_LLM_PROVIDER: 'ollama' # Optional (for OCR) - ollama or openai
VISION_LLM_MODEL: 'minicpm-v' # Optional (for OCR) - minicpm-v, for example for ollama, gpt-4o for openai
AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default is 'paperless-gpt-ocr-auto'
LOG_LEVEL: 'info' # Optional or 'debug', 'warn', 'error'
LISTEN_INTERFACE: '127.0.0.1:8080' # Optional, default is ':8080'
WEBUI_PATH: '/usr/share/paperless-gpt/webui' # Optional, default is './web-app/dist'
@@ -141,6 +144,8 @@ If you prefer to run the application manually:
| `PAPERLESS_BASE_URL` | The base URL of your paperless-ngx instance (e.g., `http://paperless-ngx:8000`). | Yes |
| `PAPERLESS_API_TOKEN` | API token for accessing paperless-ngx. You can generate one in the paperless-ngx admin interface. | Yes |
| `PAPERLESS_PUBLIC_URL` | The public URL for your Paperless instance, if it is different to your `PAPERLESS_BASE_URL` - say if you are running in Docker Compose | No |
| `MANUAL_TAG` | The tag to use for manually processing documents. Default is `paperless-gpt`. | No |
| `AUTO_TAG` | The tag to use for automatically processing documents. Default is `paperless-gpt-auto`. | No |
| `LLM_PROVIDER` | The LLM provider to use (`openai` or `ollama`). | Yes |
| `LLM_MODEL` | The model name to use (e.g., `gpt-4o`, `gpt-3.5-turbo`, `llama2`). | Yes |
| `OPENAI_API_KEY` | Your OpenAI API key. Required if using OpenAI as the LLM provider. | Cond. |
@@ -148,6 +153,7 @@ If you prefer to run the application manually:
| `OLLAMA_HOST` | The URL of the Ollama server (e.g., `http://host.docker.internal:11434`). Useful if using Ollama. Default is `http://127.0.0.1:11434`. | No |
| `VISION_LLM_PROVIDER` | The vision LLM provider to use for OCR (`openai` or `ollama`). | No |
| `VISION_LLM_MODEL` | The model name to use for OCR (e.g., `minicpm-v`). | No |
| `AUTO_OCR_TAG` | The tag to use for automatically processing documents with OCR. Default is `paperless-gpt-ocr-auto`. | No |
| `LOG_LEVEL` | The log level for the application (`info`, `debug`, `warn`, `error`). Default is `info`. | No |
| `LISTEN_INTERFACE` | The interface paperless-gpt listens to. Default is `:8080` | No |
| `WEBUI_PATH` | The path to load static content from. Default is `./web-app/dist` | No |
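
Every "Optional, default is …" entry above follows the same rule: an unset or empty variable falls back to its default at application start. A minimal sketch of that rule, using a hypothetical getEnvOrDefault helper (paperless-gpt actually assigns the defaults inline in validateOrDefaultEnvVars, shown further down in main.go):

package main

import "os"

// getEnvOrDefault is a hypothetical illustration of the "Optional,
// default is ..." behaviour documented in the table above.
func getEnvOrDefault(key, fallback string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return fallback
}

// For example: autoOcrTag := getEnvOrDefault("AUTO_OCR_TAG", "paperless-gpt-ocr-auto")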

jobs.go (33 changed lines)

@@ -2,10 +2,8 @@ package main
import (
"context"
"fmt"
"os"
"sort"
"strings"
"sync"
"time"
@@ -125,38 +123,13 @@ func processJob(app *App, job *Job) {
ctx := context.Background()
// Download images of the document
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, job.DocumentID)
fullOcrText, err := app.ProcessDocumentOCR(ctx, job.DocumentID)
if err != nil {
logger.Infof("Error downloading document images for job %s: %v", job.ID, err)
jobStore.updateJobStatus(job.ID, "failed", fmt.Sprintf("Error downloading document images: %v", err))
logger.Errorf("Error processing document OCR for job %s: %v", job.ID, err)
jobStore.updateJobStatus(job.ID, "failed", err.Error())
return
}
var ocrTexts []string
for i, imagePath := range imagePaths {
imageContent, err := os.ReadFile(imagePath)
if err != nil {
logger.Errorf("Error reading image file for job %s: %v", job.ID, err)
jobStore.updateJobStatus(job.ID, "failed", fmt.Sprintf("Error reading image file: %v", err))
return
}
ocrText, err := app.doOCRViaLLM(ctx, imageContent)
if err != nil {
logger.Errorf("Error performing OCR for job %s: %v", job.ID, err)
jobStore.updateJobStatus(job.ID, "failed", fmt.Sprintf("Error performing OCR: %v", err))
return
}
ocrTexts = append(ocrTexts, ocrText)
jobStore.updatePagesDone(job.ID, i+1) // Update PagesDone after each page is processed
}
// Combine the OCR texts
fullOcrText := strings.Join(ocrTexts, "\n\n")
// Update job status and result
jobStore.updateJobStatus(job.ID, "completed", fullOcrText)
logger.Infof("Job completed: %s", job.ID)
}
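
Assembled from the added lines above, the post-change processJob is a thin wrapper around the new ProcessDocumentOCR helper; roughly (App, Job, logger, and jobStore are the ones already defined in this package):

func processJob(app *App, job *Job) {
	ctx := context.Background()

	// Image download, per-page OCR, and joining of the page texts now live in ProcessDocumentOCR (ocr.go).
	fullOcrText, err := app.ProcessDocumentOCR(ctx, job.DocumentID)
	if err != nil {
		logger.Errorf("Error processing document OCR for job %s: %v", job.ID, err)
		jobStore.updateJobStatus(job.ID, "failed", err.Error())
		return
	}

	jobStore.updateJobStatus(job.ID, "completed", fullOcrText)
	logger.Infof("Job completed: %s", job.ID)
}

Note that the per-page updatePagesDone call from the removed loop does not reappear in ocr.go, at least in the hunks shown in this diff.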

main.go (94 changed lines)

@@ -30,8 +30,10 @@ var (
paperlessBaseURL = os.Getenv("PAPERLESS_BASE_URL")
paperlessAPIToken = os.Getenv("PAPERLESS_API_TOKEN")
openaiAPIKey = os.Getenv("OPENAI_API_KEY")
manualTag = "paperless-gpt"
autoTag = "paperless-gpt-auto"
manualTag = os.Getenv("MANUAL_TAG")
autoTag = os.Getenv("AUTO_TAG")
manualOcrTag = os.Getenv("MANUAL_OCR_TAG") // Not used yet
autoOcrTag = os.Getenv("AUTO_OCR_TAG")
llmProvider = os.Getenv("LLM_PROVIDER")
llmModel = os.Getenv("LLM_MODEL")
visionLlmProvider = os.Getenv("VISION_LLM_PROVIDER")
@@ -72,7 +74,7 @@ Please concisely select the {{.Language}} tags from the list above that best des
Be very selective and only choose the most relevant tags since too many tags will make the document less discoverable.
`
defaultOcrPrompt = `Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format.`
defaultOcrPrompt = `Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format but without a code block.`
)
// App struct to hold dependencies
@@ -85,7 +87,7 @@ type App struct {
func main() {
// Validate Environment Variables
validateEnvVars()
validateOrDefaultEnvVars()
// Initialize logrus logger
initLogger()
@@ -127,7 +129,23 @@ func main() {
backoffDuration := minBackoffDuration
for {
processedCount, err := app.processAutoTagDocuments()
processedCount, err := func() (int, error) {
count := 0
if isOcrEnabled() {
ocrCount, err := app.processAutoOcrTagDocuments()
if err != nil {
return 0, fmt.Errorf("error in processAutoOcrTagDocuments: %w", err)
}
count += ocrCount
}
autoCount, err := app.processAutoTagDocuments()
if err != nil {
return 0, fmt.Errorf("error in processAutoTagDocuments: %w", err)
}
count += autoCount
return count, nil
}()
if err != nil {
log.Errorf("Error in processAutoTagDocuments: %v", err)
time.Sleep(backoffDuration)
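
The inline closure leaves the existing backoff handling untouched while letting an OCR failure short-circuit before regular auto-tagging runs. An equivalent sketch with the closure lifted into a named method (processAllAutoDocuments is a hypothetical name, not part of this commit):

// processAllAutoDocuments is a hypothetical extraction of the inline
// closure above: OCR-tagged documents first (only when OCR is enabled),
// then the regular auto-tag queue.
func (app *App) processAllAutoDocuments() (int, error) {
	count := 0
	if isOcrEnabled() {
		ocrCount, err := app.processAutoOcrTagDocuments()
		if err != nil {
			return 0, fmt.Errorf("error in processAutoOcrTagDocuments: %w", err)
		}
		count += ocrCount
	}
	autoCount, err := app.processAutoTagDocuments()
	if err != nil {
		return 0, fmt.Errorf("error in processAutoTagDocuments: %w", err)
	}
	return count + autoCount, nil
}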
@@ -242,8 +260,32 @@ func isOcrEnabled() bool {
return visionLlmModel != "" && visionLlmProvider != ""
}
// validateEnvVars ensures all necessary environment variables are set
func validateEnvVars() {
// validateOrDefaultEnvVars ensures all necessary environment variables are set
func validateOrDefaultEnvVars() {
if manualTag == "" {
manualTag = "paperless-gpt"
}
fmt.Printf("Using %s as manual tag\n", manualTag)
if autoTag == "" {
autoTag = "paperless-gpt-auto"
}
fmt.Printf("Using %s as auto tag\n", autoTag)
if manualOcrTag == "" {
manualOcrTag = "paperless-gpt-ocr"
}
if isOcrEnabled() {
fmt.Printf("Using %s as manual OCR tag\n", manualOcrTag)
}
if autoOcrTag == "" {
autoOcrTag = "paperless-gpt-ocr-auto"
}
if isOcrEnabled() {
fmt.Printf("Using %s as auto OCR tag\n", autoOcrTag)
}
if paperlessBaseURL == "" {
log.Fatal("Please set the PAPERLESS_BASE_URL environment variable.")
}
@@ -306,6 +348,44 @@ func (app *App) processAutoTagDocuments() (int, error) {
return len(documents), nil
}
// processAutoOcrTagDocuments handles the background auto-tagging of OCR documents
func (app *App) processAutoOcrTagDocuments() (int, error) {
ctx := context.Background()
documents, err := app.Client.GetDocumentsByTags(ctx, []string{autoOcrTag})
if err != nil {
return 0, fmt.Errorf("error fetching documents with autoOcrTag: %w", err)
}
if len(documents) == 0 {
log.Debugf("No documents with tag %s found", autoOcrTag)
return 0, nil // No documents to process
}
log.Debugf("Found at least %d remaining documents with tag %s", len(documents), autoOcrTag)
documents = documents[:1] // Process only one document at a time
ocrContent, err := app.ProcessDocumentOCR(ctx, documents[0].ID)
if err != nil {
return 0, fmt.Errorf("error processing document OCR: %w", err)
}
log.Debugf("OCR content for document %d: %s", documents[0].ID, ocrContent)
err = app.Client.UpdateDocuments(ctx, []DocumentSuggestion{
{
ID: documents[0].ID,
OriginalDocument: documents[0],
SuggestedContent: ocrContent,
},
}, app.Database, false)
if err != nil {
return 0, fmt.Errorf("error updating documents: %w", err)
}
return 1, nil // Processed one document
}
// removeTagFromList removes a specific tag from a list of tags
func removeTagFromList(tags []string, tagToRemove string) []string {
filteredTags := []string{}
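
processAutoOcrTagDocuments deliberately slices the fetched list to documents[:1], so each polling cycle OCRs at most one document before control returns to the backoff loop; slow vision-model calls therefore never hold up a cycle for more than one document. A hedged sketch of what a batch variant could look like (not part of this commit; it reuses only the calls visible above):

// processAllAutoOcrTagDocuments is a hypothetical batch variant that
// handles every tagged document in one cycle instead of documents[:1].
func (app *App) processAllAutoOcrTagDocuments(ctx context.Context) (int, error) {
	documents, err := app.Client.GetDocumentsByTags(ctx, []string{autoOcrTag})
	if err != nil {
		return 0, fmt.Errorf("error fetching documents with autoOcrTag: %w", err)
	}
	processed := 0
	for _, doc := range documents {
		ocrContent, err := app.ProcessDocumentOCR(ctx, doc.ID)
		if err != nil {
			return processed, fmt.Errorf("error processing OCR for document %d: %w", doc.ID, err)
		}
		err = app.Client.UpdateDocuments(ctx, []DocumentSuggestion{{
			ID:               doc.ID,
			OriginalDocument: doc,
			SuggestedContent: ocrContent,
		}}, app.Database, false)
		if err != nil {
			return processed, fmt.Errorf("error updating document %d: %w", doc.ID, err)
		}
		processed++
	}
	return processed, nil
}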

ocr.go (new file, 39 changed lines)

@@ -0,0 +1,39 @@
package main
import (
"context"
"fmt"
"os"
"strings"
)
// ProcessDocumentOCR processes a document through OCR and returns the combined text
func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) {
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID)
defer func() {
for _, imagePath := range imagePaths {
os.Remove(imagePath)
}
}()
if err != nil {
return "", fmt.Errorf("error downloading document images: %w", err)
}
var ocrTexts []string
for _, imagePath := range imagePaths {
imageContent, err := os.ReadFile(imagePath)
if err != nil {
return "", fmt.Errorf("error reading image file: %w", err)
}
ocrText, err := app.doOCRViaLLM(ctx, imageContent)
if err != nil {
return "", fmt.Errorf("error performing OCR: %w", err)
}
log.Debugf("OCR text: %s", ocrText)
ocrTexts = append(ocrTexts, ocrText)
}
return strings.Join(ocrTexts, "\n\n"), nil
}
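
Two details worth noting in ProcessDocumentOCR: the deferred cleanup is registered before the download error is checked, so any images written before a failure are still removed, and the page texts are joined with blank lines so page boundaries stay visible in the combined result. A minimal caller sketch, assuming an *App wired up as in main.go and the package-level logrus logger:

// ocrOneDocument is a hypothetical caller: OCR a single document and log the result.
func ocrOneDocument(app *App, documentID int) {
	ctx := context.Background()
	text, err := app.ProcessDocumentOCR(ctx, documentID)
	if err != nil {
		log.Errorf("OCR of document %d failed: %v", documentID, err)
		return
	}
	log.Debugf("OCR of document %d produced %d characters", documentID, len(text))
}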


@@ -273,6 +273,7 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum
// remove autoTag to prevent infinite loop (even if it is in the original tags)
originalTags = removeTagFromList(originalTags, autoTag)
originalTags = removeTagFromList(originalTags, autoOcrTag)
if len(tags) == 0 {
tags = originalTags
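
Stripping autoOcrTag here mirrors the existing autoTag handling: both tags are removed from the tag list written back to paperless-ngx, so a document whose content was just replaced is not picked up again on the next poll. An illustrative use of removeTagFromList, assuming it behaves as its doc comment in main.go describes (its full body is not shown in this diff):

// Illustrative only: removeTagFromList drops one tag and keeps the rest.
tags := []string{"invoice", "paperless-gpt-ocr-auto"}
tags = removeTagFromList(tags, "paperless-gpt-ocr-auto") // -> ["invoice"]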