feat: add support for automatic OCR (#75)

Icereed 2025-01-06 23:03:41 +01:00 committed by GitHub
parent 99ad4883e8
commit 32f83ec93f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 136 additions and 37 deletions


@@ -71,6 +71,8 @@ services:
PAPERLESS_BASE_URL: 'http://paperless-ngx:8000'
PAPERLESS_API_TOKEN: 'your_paperless_api_token'
PAPERLESS_PUBLIC_URL: 'http://paperless.mydomain.com' # Optional, your public link to access Paperless
MANUAL_TAG: 'paperless-gpt' # Optional, default is 'paperless-gpt'
AUTO_TAG: 'paperless-gpt-auto' # Optional, default is 'paperless-gpt-auto'
LLM_PROVIDER: 'openai' # or 'ollama'
LLM_MODEL: 'gpt-4o' # or 'llama2'
OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
@@ -78,6 +80,7 @@ services:
OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
VISION_LLM_PROVIDER: 'ollama' # Optional (for OCR) - ollama or openai
VISION_LLM_MODEL: 'minicpm-v' # Optional (for OCR) - minicpm-v, for example for ollama, gpt-4o for openai
AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default is 'paperless-gpt-ocr-auto'
LOG_LEVEL: 'info' # Optional or 'debug', 'warn', 'error'
LISTEN_INTERFACE: '127.0.0.1:8080' # Optional, default is ':8080'
WEBUI_PATH: '/usr/share/paperless-gpt/webui' # Optional, default is './web-app/dist'
@@ -141,6 +144,8 @@ If you prefer to run the application manually:
| `PAPERLESS_BASE_URL` | The base URL of your paperless-ngx instance (e.g., `http://paperless-ngx:8000`). | Yes |
| `PAPERLESS_API_TOKEN` | API token for accessing paperless-ngx. You can generate one in the paperless-ngx admin interface. | Yes |
| `PAPERLESS_PUBLIC_URL` | The public URL for your Paperless instance, if it is different to your `PAPERLESS_BASE_URL` - say if you are running in Docker Compose | No |
| `MANUAL_TAG` | The tag to use for manually processing documents. Default is `paperless-gpt`. | No |
| `AUTO_TAG` | The tag to use for automatically processing documents. Default is `paperless-gpt-auto`. | No |
| `LLM_PROVIDER` | The LLM provider to use (`openai` or `ollama`). | Yes |
| `LLM_MODEL` | The model name to use (e.g., `gpt-4o`, `gpt-3.5-turbo`, `llama2`). | Yes |
| `OPENAI_API_KEY` | Your OpenAI API key. Required if using OpenAI as the LLM provider. | Cond. |
@@ -148,6 +153,7 @@ If you prefer to run the application manually:
| `OLLAMA_HOST` | The URL of the Ollama server (e.g., `http://host.docker.internal:11434`). Useful if using Ollama. Default is `http://127.0.0.1:11434`. | No |
| `VISION_LLM_PROVIDER` | The vision LLM provider to use for OCR (`openai` or `ollama`). | No |
| `VISION_LLM_MODEL` | The model name to use for OCR (e.g., `minicpm-v`). | No |
| `AUTO_OCR_TAG` | The tag to use for automatically processing documents with OCR. Default is `paperless-gpt-ocr-auto`. | No |
| `LOG_LEVEL` | The log level for the application (`info`, `debug`, `warn`, `error`). Default is `info`. | No |
| `LISTEN_INTERFACE` | The interface paperless-gpt listens to. Default is `:8080` | No |
| `WEBUI_PATH` | The path to load static content from. Default is `./web-app/dist` | No |
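
Every "Optional, default is …" entry above follows the same rule: an unset or empty variable falls back to its default at application start. A minimal sketch of that rule, using a hypothetical getEnvOrDefault helper (paperless-gpt actually assigns the defaults inline in validateOrDefaultEnvVars, shown further down in main.go):

package main

import "os"

// getEnvOrDefault is a hypothetical illustration of the "Optional,
// default is ..." behaviour documented in the table above.
func getEnvOrDefault(key, fallback string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return fallback
}

// For example: autoOcrTag := getEnvOrDefault("AUTO_OCR_TAG", "paperless-gpt-ocr-auto")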

jobs.go (33 changed lines)

@@ -2,10 +2,8 @@ package main
import (
"context"
"fmt"
"os"
"sort"
"strings"
"sync"
"time"
@@ -125,38 +123,13 @@ func processJob(app *App, job *Job) {
ctx := context.Background()
// Download images of the document
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, job.DocumentID)
fullOcrText, err := app.ProcessDocumentOCR(ctx, job.DocumentID)
if err != nil {
logger.Infof("Error downloading document images for job %s: %v", job.ID, err)
jobStore.updateJobStatus(job.ID, "failed", fmt.Sprintf("Error downloading document images: %v", err))
logger.Errorf("Error processing document OCR for job %s: %v", job.ID, err)
jobStore.updateJobStatus(job.ID, "failed", err.Error())
return
}
var ocrTexts []string
for i, imagePath := range imagePaths {
imageContent, err := os.ReadFile(imagePath)
if err != nil {
logger.Errorf("Error reading image file for job %s: %v", job.ID, err)
jobStore.updateJobStatus(job.ID, "failed", fmt.Sprintf("Error reading image file: %v", err))
return
}
ocrText, err := app.doOCRViaLLM(ctx, imageContent)
if err != nil {
logger.Errorf("Error performing OCR for job %s: %v", job.ID, err)
jobStore.updateJobStatus(job.ID, "failed", fmt.Sprintf("Error performing OCR: %v", err))
return
}
ocrTexts = append(ocrTexts, ocrText)
jobStore.updatePagesDone(job.ID, i+1) // Update PagesDone after each page is processed
}
// Combine the OCR texts
fullOcrText := strings.Join(ocrTexts, "\n\n")
// Update job status and result
jobStore.updateJobStatus(job.ID, "completed", fullOcrText)
logger.Infof("Job completed: %s", job.ID)
}
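
Assembled from the added lines above, the post-change processJob is a thin wrapper around the new ProcessDocumentOCR helper; roughly (App, Job, logger, and jobStore are the ones already defined in this package):

func processJob(app *App, job *Job) {
	ctx := context.Background()

	// Image download, per-page OCR, and joining of the page texts now live in ProcessDocumentOCR (ocr.go).
	fullOcrText, err := app.ProcessDocumentOCR(ctx, job.DocumentID)
	if err != nil {
		logger.Errorf("Error processing document OCR for job %s: %v", job.ID, err)
		jobStore.updateJobStatus(job.ID, "failed", err.Error())
		return
	}

	jobStore.updateJobStatus(job.ID, "completed", fullOcrText)
	logger.Infof("Job completed: %s", job.ID)
}

Note that the per-page updatePagesDone call from the removed loop does not reappear in ocr.go, at least in the hunks shown in this diff.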

main.go (94 changed lines)

@@ -30,8 +30,10 @@ var (
paperlessBaseURL = os.Getenv("PAPERLESS_BASE_URL")
paperlessAPIToken = os.Getenv("PAPERLESS_API_TOKEN")
openaiAPIKey = os.Getenv("OPENAI_API_KEY")
manualTag = "paperless-gpt"
autoTag = "paperless-gpt-auto"
manualTag = os.Getenv("MANUAL_TAG")
autoTag = os.Getenv("AUTO_TAG")
manualOcrTag = os.Getenv("MANUAL_OCR_TAG") // Not used yet
autoOcrTag = os.Getenv("AUTO_OCR_TAG")
llmProvider = os.Getenv("LLM_PROVIDER")
llmModel = os.Getenv("LLM_MODEL")
visionLlmProvider = os.Getenv("VISION_LLM_PROVIDER")
@@ -72,7 +74,7 @@ Please concisely select the {{.Language}} tags from the list above that best des
Be very selective and only choose the most relevant tags since too many tags will make the document less discoverable.
`
defaultOcrPrompt = `Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format.`
defaultOcrPrompt = `Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format but without a code block.`
)
// App struct to hold dependencies
@@ -85,7 +87,7 @@ type App struct {
func main() {
// Validate Environment Variables
validateEnvVars()
validateOrDefaultEnvVars()
// Initialize logrus logger
initLogger()
@@ -127,7 +129,23 @@ func main() {
backoffDuration := minBackoffDuration
for {
processedCount, err := app.processAutoTagDocuments()
processedCount, err := func() (int, error) {
count := 0
if isOcrEnabled() {
ocrCount, err := app.processAutoOcrTagDocuments()
if err != nil {
return 0, fmt.Errorf("error in processAutoOcrTagDocuments: %w", err)
}
count += ocrCount
}
autoCount, err := app.processAutoTagDocuments()
if err != nil {
return 0, fmt.Errorf("error in processAutoTagDocuments: %w", err)
}
count += autoCount
return count, nil
}()
if err != nil {
log.Errorf("Error in processAutoTagDocuments: %v", err)
time.Sleep(backoffDuration)
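
The inline closure leaves the existing backoff handling untouched while letting an OCR failure short-circuit before regular auto-tagging runs. An equivalent sketch with the closure lifted into a named method (processAllAutoDocuments is a hypothetical name, not part of this commit):

// processAllAutoDocuments is a hypothetical extraction of the inline
// closure above: OCR-tagged documents first (only when OCR is enabled),
// then the regular auto-tag queue.
func (app *App) processAllAutoDocuments() (int, error) {
	count := 0
	if isOcrEnabled() {
		ocrCount, err := app.processAutoOcrTagDocuments()
		if err != nil {
			return 0, fmt.Errorf("error in processAutoOcrTagDocuments: %w", err)
		}
		count += ocrCount
	}
	autoCount, err := app.processAutoTagDocuments()
	if err != nil {
		return 0, fmt.Errorf("error in processAutoTagDocuments: %w", err)
	}
	return count + autoCount, nil
}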
@@ -242,8 +260,32 @@ func isOcrEnabled() bool {
return visionLlmModel != "" && visionLlmProvider != ""
}
// validateEnvVars ensures all necessary environment variables are set
func validateEnvVars() {
// validateOrDefaultEnvVars ensures all necessary environment variables are set
func validateOrDefaultEnvVars() {
if manualTag == "" {
manualTag = "paperless-gpt"
}
fmt.Printf("Using %s as manual tag\n", manualTag)
if autoTag == "" {
autoTag = "paperless-gpt-auto"
}
fmt.Printf("Using %s as auto tag\n", autoTag)
if manualOcrTag == "" {
manualOcrTag = "paperless-gpt-ocr"
}
if isOcrEnabled() {
fmt.Printf("Using %s as manual OCR tag\n", manualOcrTag)
}
if autoOcrTag == "" {
autoOcrTag = "paperless-gpt-ocr-auto"
}
if isOcrEnabled() {
fmt.Printf("Using %s as auto OCR tag\n", autoOcrTag)
}
if paperlessBaseURL == "" {
log.Fatal("Please set the PAPERLESS_BASE_URL environment variable.")
}
@@ -306,6 +348,44 @@ func (app *App) processAutoTagDocuments() (int, error) {
return len(documents), nil
}
// processAutoOcrTagDocuments handles the background auto-tagging of OCR documents
func (app *App) processAutoOcrTagDocuments() (int, error) {
ctx := context.Background()
documents, err := app.Client.GetDocumentsByTags(ctx, []string{autoOcrTag})
if err != nil {
return 0, fmt.Errorf("error fetching documents with autoOcrTag: %w", err)
}
if len(documents) == 0 {
log.Debugf("No documents with tag %s found", autoOcrTag)
return 0, nil // No documents to process
}
log.Debugf("Found at least %d remaining documents with tag %s", len(documents), autoOcrTag)
documents = documents[:1] // Process only one document at a time
ocrContent, err := app.ProcessDocumentOCR(ctx, documents[0].ID)
if err != nil {
return 0, fmt.Errorf("error processing document OCR: %w", err)
}
log.Debugf("OCR content for document %d: %s", documents[0].ID, ocrContent)
err = app.Client.UpdateDocuments(ctx, []DocumentSuggestion{
{
ID: documents[0].ID,
OriginalDocument: documents[0],
SuggestedContent: ocrContent,
},
}, app.Database, false)
if err != nil {
return 0, fmt.Errorf("error updating documents: %w", err)
}
return 1, nil // Processed one document
}
// removeTagFromList removes a specific tag from a list of tags
func removeTagFromList(tags []string, tagToRemove string) []string {
filteredTags := []string{}
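
processAutoOcrTagDocuments deliberately slices the fetched list to documents[:1], so each polling cycle OCRs at most one document before control returns to the backoff loop; slow vision-model calls therefore never hold up a cycle for more than one document. A hedged sketch of what a batch variant could look like (not part of this commit; it reuses only the calls visible above):

// processAllAutoOcrTagDocuments is a hypothetical batch variant that
// handles every tagged document in one cycle instead of documents[:1].
func (app *App) processAllAutoOcrTagDocuments(ctx context.Context) (int, error) {
	documents, err := app.Client.GetDocumentsByTags(ctx, []string{autoOcrTag})
	if err != nil {
		return 0, fmt.Errorf("error fetching documents with autoOcrTag: %w", err)
	}
	processed := 0
	for _, doc := range documents {
		ocrContent, err := app.ProcessDocumentOCR(ctx, doc.ID)
		if err != nil {
			return processed, fmt.Errorf("error processing OCR for document %d: %w", doc.ID, err)
		}
		err = app.Client.UpdateDocuments(ctx, []DocumentSuggestion{{
			ID:               doc.ID,
			OriginalDocument: doc,
			SuggestedContent: ocrContent,
		}}, app.Database, false)
		if err != nil {
			return processed, fmt.Errorf("error updating document %d: %w", doc.ID, err)
		}
		processed++
	}
	return processed, nil
}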

ocr.go (new file, 39 changed lines)

@@ -0,0 +1,39 @@
package main
import (
"context"
"fmt"
"os"
"strings"
)
// ProcessDocumentOCR processes a document through OCR and returns the combined text
func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) {
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID)
defer func() {
for _, imagePath := range imagePaths {
os.Remove(imagePath)
}
}()
if err != nil {
return "", fmt.Errorf("error downloading document images: %w", err)
}
var ocrTexts []string
for _, imagePath := range imagePaths {
imageContent, err := os.ReadFile(imagePath)
if err != nil {
return "", fmt.Errorf("error reading image file: %w", err)
}
ocrText, err := app.doOCRViaLLM(ctx, imageContent)
if err != nil {
return "", fmt.Errorf("error performing OCR: %w", err)
}
log.Debugf("OCR text: %s", ocrText)
ocrTexts = append(ocrTexts, ocrText)
}
return strings.Join(ocrTexts, "\n\n"), nil
}
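
Two details worth noting in ProcessDocumentOCR: the deferred cleanup is registered before the download error is checked, so any images written before a failure are still removed, and the page texts are joined with blank lines so page boundaries stay visible in the combined result. A minimal caller sketch, assuming an *App wired up as in main.go and the package-level logrus logger:

// ocrOneDocument is a hypothetical caller: OCR a single document and log the result.
func ocrOneDocument(app *App, documentID int) {
	ctx := context.Background()
	text, err := app.ProcessDocumentOCR(ctx, documentID)
	if err != nil {
		log.Errorf("OCR of document %d failed: %v", documentID, err)
		return
	}
	log.Debugf("OCR of document %d produced %d characters", documentID, len(text))
}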


@@ -273,6 +273,7 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum
// remove autoTag to prevent infinite loop (even if it is in the original tags)
originalTags = removeTagFromList(originalTags, autoTag)
originalTags = removeTagFromList(originalTags, autoOcrTag)
if len(tags) == 0 {
tags = originalTags
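
Stripping autoOcrTag here mirrors the existing autoTag handling: both tags are removed from the tag list written back to paperless-ngx, so a document whose content was just replaced is not picked up again on the next poll. An illustrative use of removeTagFromList, assuming it behaves as its doc comment in main.go describes (its full body is not shown in this diff):

// Illustrative only: removeTagFromList drops one tag and keeps the rest.
tags := []string{"invoice", "paperless-gpt-ocr-auto"}
tags = removeTagFromList(tags, "paperless-gpt-ocr-auto") // -> ["invoice"]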