From 32f83ec93f969ef930a55e31b752611a8262c5c8 Mon Sep 17 00:00:00 2001
From: Icereed <domi@icereed.net>
Date: Mon, 6 Jan 2025 23:03:41 +0100
Subject: [PATCH] feat: add support for automatic OCR (#75)

---
 README.md    |  6 ++++
 jobs.go      | 33 ++----------------
 main.go      | 94 ++++++++++++++++++++++++++++++++++++++++++++++++----
 ocr.go       | 39 ++++++++++++++++++++++
 paperless.go |  1 +
 5 files changed, 136 insertions(+), 37 deletions(-)
 create mode 100644 ocr.go

diff --git a/README.md b/README.md
index 4276357..77ff36a 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,8 @@ services:
       PAPERLESS_BASE_URL: 'http://paperless-ngx:8000'
       PAPERLESS_API_TOKEN: 'your_paperless_api_token'
       PAPERLESS_PUBLIC_URL: 'http://paperless.mydomain.com' # Optional, your public link to access Paperless
+      MANUAL_TAG: 'paperless-gpt' # Optional, default is 'paperless-gpt'
+      AUTO_TAG: 'paperless-gpt-auto' # Optional, default is 'paperless-gpt-auto'
       LLM_PROVIDER: 'openai' # or 'ollama'
       LLM_MODEL: 'gpt-4o'     # or 'llama2'
       OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
@@ -78,6 +80,7 @@ services:
       OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
       VISION_LLM_PROVIDER: 'ollama' # Optional (for OCR) - ollama or openai
       VISION_LLM_MODEL: 'minicpm-v' # Optional (for OCR) - minicpm-v, for example for ollama, gpt-4o for openai
+      AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default is 'paperless-gpt-ocr-auto'
       LOG_LEVEL: 'info' # Optional or 'debug', 'warn', 'error'
       LISTEN_INTERFACE: '127.0.0.1:8080' # Optional, default is ':8080'
       WEBUI_PATH: '/usr/share/paperless-gpt/webui' # Optional, default is './web-app/dist'
@@ -141,6 +144,8 @@ If you prefer to run the application manually:
 | `PAPERLESS_BASE_URL`  | The base URL of your paperless-ngx instance (e.g., `http://paperless-ngx:8000`).                                                                          | Yes      |
 | `PAPERLESS_API_TOKEN` | API token for accessing paperless-ngx. You can generate one in the paperless-ngx admin interface.                                                         | Yes      |
 | `PAPERLESS_PUBLIC_URL` | The public URL for your Paperless instance, if it is different to your `PAPERLESS_BASE_URL` - say if you are running in Docker Compose | No |
+| `MANUAL_TAG`          | The tag to use for manually processing documents. Default is `paperless-gpt`.                                                                             | No       |
+| `AUTO_TAG`            | The tag to use for automatically processing documents. Default is `paperless-gpt-auto`.                                                                   | No       |
 | `LLM_PROVIDER`        | The LLM provider to use (`openai` or `ollama`).                                                                                                           | Yes      |
 | `LLM_MODEL`           | The model name to use (e.g., `gpt-4o`, `gpt-3.5-turbo`, `llama2`).                                                                                        | Yes      |
 | `OPENAI_API_KEY`      | Your OpenAI API key. Required if using OpenAI as the LLM provider.                                                                                        | Cond.    |
@@ -148,6 +153,7 @@ If you prefer to run the application manually:
 | `OLLAMA_HOST`         | The URL of the Ollama server (e.g., `http://host.docker.internal:11434`). Useful if using Ollama. Default is `http://127.0.0.1:11434`.                    | No       |
 | `VISION_LLM_PROVIDER` | The vision LLM provider to use for OCR (`openai` or `ollama`).                                                                                            | No       |
 | `VISION_LLM_MODEL`    | The model name to use for OCR (e.g., `minicpm-v`).                                                                                                        | No       |
+| `AUTO_OCR_TAG`        | The tag to use for automatically processing documents with OCR. Default is `paperless-gpt-ocr-auto`. Only used when `VISION_LLM_PROVIDER` and `VISION_LLM_MODEL` are set. | No       |
 | `LOG_LEVEL`           | The log level for the application (`info`, `debug`, `warn`, `error`). Default is `info`.                                                                  | No       |
 | `LISTEN_INTERFACE`    | The interface paperless-gpt listens to. Default is `:8080`                                                                                                | No       |
 | `WEBUI_PATH`          | The path to load static content from. Default is `./web-app/dist`                                                                                         | No       |
diff --git a/jobs.go b/jobs.go
index bc58b82..7b21876 100644
--- a/jobs.go
+++ b/jobs.go
@@ -2,10 +2,8 @@ package main
 
 import (
 	"context"
-	"fmt"
 	"os"
 	"sort"
-	"strings"
 	"sync"
 	"time"
 
@@ -125,38 +123,13 @@ func processJob(app *App, job *Job) {
 
 	ctx := context.Background()
 
-	// Download images of the document
-	imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, job.DocumentID)
+	fullOcrText, err := app.ProcessDocumentOCR(ctx, job.DocumentID)
 	if err != nil {
-		logger.Infof("Error downloading document images for job %s: %v", job.ID, err)
-		jobStore.updateJobStatus(job.ID, "failed", fmt.Sprintf("Error downloading document images: %v", err))
+		logger.Errorf("Error processing document OCR for job %s: %v", job.ID, err)
+		jobStore.updateJobStatus(job.ID, "failed", err.Error())
 		return
 	}
 
-	var ocrTexts []string
-	for i, imagePath := range imagePaths {
-		imageContent, err := os.ReadFile(imagePath)
-		if err != nil {
-			logger.Errorf("Error reading image file for job %s: %v", job.ID, err)
-			jobStore.updateJobStatus(job.ID, "failed", fmt.Sprintf("Error reading image file: %v", err))
-			return
-		}
-
-		ocrText, err := app.doOCRViaLLM(ctx, imageContent)
-		if err != nil {
-			logger.Errorf("Error performing OCR for job %s: %v", job.ID, err)
-			jobStore.updateJobStatus(job.ID, "failed", fmt.Sprintf("Error performing OCR: %v", err))
-			return
-		}
-
-		ocrTexts = append(ocrTexts, ocrText)
-		jobStore.updatePagesDone(job.ID, i+1) // Update PagesDone after each page is processed
-	}
-
-	// Combine the OCR texts
-	fullOcrText := strings.Join(ocrTexts, "\n\n")
-
-	// Update job status and result
 	jobStore.updateJobStatus(job.ID, "completed", fullOcrText)
 	logger.Infof("Job completed: %s", job.ID)
 }
diff --git a/main.go b/main.go
index 91c86a7..b795ad8 100644
--- a/main.go
+++ b/main.go
@@ -30,8 +30,10 @@ var (
 	paperlessBaseURL  = os.Getenv("PAPERLESS_BASE_URL")
 	paperlessAPIToken = os.Getenv("PAPERLESS_API_TOKEN")
 	openaiAPIKey      = os.Getenv("OPENAI_API_KEY")
-	manualTag         = "paperless-gpt"
-	autoTag           = "paperless-gpt-auto"
+	manualTag         = os.Getenv("MANUAL_TAG")
+	autoTag           = os.Getenv("AUTO_TAG")
+	manualOcrTag      = os.Getenv("MANUAL_OCR_TAG") // Not used yet
+	autoOcrTag        = os.Getenv("AUTO_OCR_TAG")
 	llmProvider       = os.Getenv("LLM_PROVIDER")
 	llmModel          = os.Getenv("LLM_MODEL")
 	visionLlmProvider = os.Getenv("VISION_LLM_PROVIDER")
@@ -72,7 +74,7 @@ Please concisely select the {{.Language}} tags from the list above that best des
 Be very selective and only choose the most relevant tags since too many tags will make the document less discoverable.
 `
 
-	defaultOcrPrompt = `Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format.`
+	defaultOcrPrompt = `Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format but without a code block.`
 )
 
 // App struct to hold dependencies
@@ -85,7 +87,7 @@ type App struct {
 
 func main() {
 	// Validate Environment Variables
-	validateEnvVars()
+	validateOrDefaultEnvVars()
 
 	// Initialize logrus logger
 	initLogger()
@@ -127,7 +129,23 @@ func main() {
 
 		backoffDuration := minBackoffDuration
 		for {
-			processedCount, err := app.processAutoTagDocuments()
+			processedCount, err := func() (int, error) {
+				count := 0
+				if isOcrEnabled() {
+					ocrCount, err := app.processAutoOcrTagDocuments()
+					if err != nil {
+						return 0, fmt.Errorf("error in processAutoOcrTagDocuments: %w", err)
+					}
+					count += ocrCount
+				}
+				autoCount, err := app.processAutoTagDocuments()
+				if err != nil {
+					return 0, fmt.Errorf("error in processAutoTagDocuments: %w", err)
+				}
+				count += autoCount
+				return count, nil
+			}()
+
 			if err != nil {
 				log.Errorf("Error in processAutoTagDocuments: %v", err)
 				time.Sleep(backoffDuration)
@@ -242,8 +260,32 @@ func isOcrEnabled() bool {
 	return visionLlmModel != "" && visionLlmProvider != ""
 }
 
-// validateEnvVars ensures all necessary environment variables are set
-func validateEnvVars() {
+// validateOrDefaultEnvVars fills in defaults for the optional tag variables and ensures all necessary environment variables are set
+func validateOrDefaultEnvVars() {
+	if manualTag == "" {
+		manualTag = "paperless-gpt"
+	}
+	fmt.Printf("Using %s as manual tag\n", manualTag)
+
+	if autoTag == "" {
+		autoTag = "paperless-gpt-auto"
+	}
+	fmt.Printf("Using %s as auto tag\n", autoTag)
+
+	if manualOcrTag == "" {
+		manualOcrTag = "paperless-gpt-ocr"
+	}
+	if isOcrEnabled() {
+		fmt.Printf("Using %s as manual OCR tag\n", manualOcrTag)
+	}
+
+	if autoOcrTag == "" {
+		autoOcrTag = "paperless-gpt-ocr-auto"
+	}
+	if isOcrEnabled() {
+		fmt.Printf("Using %s as auto OCR tag\n", autoOcrTag)
+	}
+
 	if paperlessBaseURL == "" {
 		log.Fatal("Please set the PAPERLESS_BASE_URL environment variable.")
 	}
@@ -306,6 +348,44 @@ func (app *App) processAutoTagDocuments() (int, error) {
 	return len(documents), nil
 }
 
+// processAutoOcrTagDocuments runs OCR on a single document carrying autoOcrTag and submits the text as SuggestedContent via UpdateDocuments; it returns how many documents were processed (0 or 1)
+func (app *App) processAutoOcrTagDocuments() (int, error) {
+	ctx := context.Background()
+
+	documents, err := app.Client.GetDocumentsByTags(ctx, []string{autoOcrTag})
+	if err != nil {
+		return 0, fmt.Errorf("error fetching documents with autoOcrTag: %w", err)
+	}
+
+	if len(documents) == 0 {
+		log.Debugf("No documents with tag %s found", autoOcrTag)
+		return 0, nil // No documents to process
+	}
+
+	log.Debugf("Found at least %d remaining documents with tag %s", len(documents), autoOcrTag)
+
+	documents = documents[:1] // Process only one document at a time
+
+	ocrContent, err := app.ProcessDocumentOCR(ctx, documents[0].ID)
+	if err != nil {
+		return 0, fmt.Errorf("error processing document OCR: %w", err)
+	}
+	log.Debugf("OCR content for document %d: %s", documents[0].ID, ocrContent)
+
+	err = app.Client.UpdateDocuments(ctx, []DocumentSuggestion{
+		{
+			ID:               documents[0].ID,
+			OriginalDocument: documents[0],
+			SuggestedContent: ocrContent,
+		},
+	}, app.Database, false) // UpdateDocuments strips autoOcrTag from the original tags, presumably so this document is not selected again
+	if err != nil {
+		return 0, fmt.Errorf("error updating documents: %w", err)
+	}
+
+	return 1, nil // Processed one document
+}
+
 // removeTagFromList removes a specific tag from a list of tags
 func removeTagFromList(tags []string, tagToRemove string) []string {
 	filteredTags := []string{}
diff --git a/ocr.go b/ocr.go
new file mode 100644
index 0000000..ca8ed28
--- /dev/null
+++ b/ocr.go
@@ -0,0 +1,39 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"strings"
+)
+
+// ProcessDocumentOCR downloads the document as images, runs each image through doOCRViaLLM, and returns the per-image texts joined by blank lines; the first failure aborts with an empty string and a wrapped error
+func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) {
+	imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID)
+	defer func() { // registered before the error check so any paths returned alongside an error are removed too
+		for _, imagePath := range imagePaths {
+			os.Remove(imagePath) // best-effort cleanup; the Remove error is ignored
+		}
+	}()
+	if err != nil {
+		return "", fmt.Errorf("error downloading document images: %w", err)
+	}
+
+	var ocrTexts []string
+	for _, imagePath := range imagePaths {
+		imageContent, err := os.ReadFile(imagePath)
+		if err != nil {
+			return "", fmt.Errorf("error reading image file: %w", err)
+		}
+
+		ocrText, err := app.doOCRViaLLM(ctx, imageContent)
+		if err != nil {
+			return "", fmt.Errorf("error performing OCR: %w", err)
+		}
+		log.Debugf("OCR text: %s", ocrText)
+
+		ocrTexts = append(ocrTexts, ocrText)
+	}
+
+	return strings.Join(ocrTexts, "\n\n"), nil
+}
diff --git a/paperless.go b/paperless.go
index 6cff47c..a96fc1e 100644
--- a/paperless.go
+++ b/paperless.go
@@ -273,6 +273,7 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum
 
 		// remove autoTag to prevent infinite loop (even if it is in the original tags)
 		originalTags = removeTagFromList(originalTags, autoTag)
+		originalTags = removeTagFromList(originalTags, autoOcrTag)
 
 		if len(tags) == 0 {
 			tags = originalTags