Limit pages for OCR

Bonus: Adds logging for image dimensions and size. Closes #95
2025-03-13 05:08:01 -05:00 · 2025-01-10 16:44:41 +01:00 · 2025-01-10 16:44:41 +01:00 · ebc26b5bdf
commit ebc26b5bdf
parent d72fbcb527
6 changed files with 48 additions and 7 deletions
--- a/README.md
+++ b/README.md
@ -97,6 +97,7 @@ services:
      VISION_LLM_PROVIDER: 'ollama'        # (for OCR) - openai or ollama
      VISION_LLM_MODEL: 'minicpm-v'        # (for OCR) - minicpm-v (ollama example), gpt-4o (for openai), etc.
      AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default: paperless-gpt-ocr-auto
+      OCR_LIMIT_PAGES: '5'                 # Optional, default: 5. Set to 0 for no limit.
      LOG_LEVEL: 'info'                    # Optional: debug, warn, error
    volumes:
      - ./prompts:/app/prompts   # Mount the prompts directory
@ -166,6 +167,7 @@ services:
 | `WEBUI_PATH`           | Path for static content. Default: `./web-app/dist`.                                                             | No       |
 | `AUTO_GENERATE_TITLE`  | Generate titles automatically if `paperless-gpt-auto` is used. Default: `true`.                                  | No       |
 | `AUTO_GENERATE_TAGS`   | Generate tags automatically if `paperless-gpt-auto` is used. Default: `true`.                                   | No       |
+| `OCR_LIMIT_PAGES`      | Limit the number of pages for OCR. Set to `0` for no limit. Default: `5`.                                       | No       |

 ### Custom Prompt Templates

--- a/app_llm.go
+++ b/app_llm.go
@ -5,9 +5,12 @@ import (
 	"context"
 	"encoding/base64"
 	"fmt"
+	"image"
 	"strings"
 	"sync"

+	_ "image/jpeg"
+
 	"github.com/tmc/langchaingo/llms"
 )

@ -82,15 +85,27 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro

 	prompt := promptBuffer.String()

+	// Log the image dimensions
+	img, _, err := image.Decode(bytes.NewReader(jpegBytes))
+	if err != nil {
+		return "", fmt.Errorf("error decoding image: %v", err)
+	}
+	bounds := img.Bounds()
+	log.Debugf("Image dimensions: %dx%d", bounds.Dx(), bounds.Dy())
+
 	// If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision
 	var parts []llms.ContentPart
 	if strings.ToLower(visionLlmProvider) != "openai" {
+		// Log image size in kilobytes
+		log.Debugf("Image size: %d KB", len(jpegBytes)/1024)
 		parts = []llms.ContentPart{
 			llms.BinaryPart("image/jpeg", jpegBytes),
 			llms.TextPart(prompt),
 		}
 	} else {
 		base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
+		// Log image size in kilobytes
+		log.Debugf("Image size: %d KB", len(base64Image)/1024)
 		parts = []llms.ContentPart{
 			llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
 			llms.TextPart(prompt),
--- a/main.go
+++ b/main.go
@ -6,6 +6,7 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"sync"
 	"text/template"
@ -43,6 +44,7 @@ var (
 	webuiPath         = os.Getenv("WEBUI_PATH")
 	autoGenerateTitle = os.Getenv("AUTO_GENERATE_TITLE")
 	autoGenerateTags  = os.Getenv("AUTO_GENERATE_TAGS")
+	limitOcrPages     int // Will be read from OCR_LIMIT_PAGES

 	// Templates
 	titleTemplate *template.Template
@ -309,6 +311,19 @@ func validateOrDefaultEnvVars() {
 	if (llmProvider == "openai" || visionLlmProvider == "openai") && openaiAPIKey == "" {
 		log.Fatal("Please set the OPENAI_API_KEY environment variable for OpenAI provider.")
 	}
+
+	if isOcrEnabled() {
+		rawLimitOcrPages := os.Getenv("OCR_LIMIT_PAGES")
+		if rawLimitOcrPages == "" {
+			limitOcrPages = 5
+		} else {
+			var err error
+			limitOcrPages, err = strconv.Atoi(rawLimitOcrPages)
+			if err != nil {
+				log.Fatalf("Invalid OCR_LIMIT_PAGES value: %v", err)
+			}
+		}
+	}
 }

 // processAutoTagDocuments handles the background auto-tagging of documents
--- a/ocr.go
+++ b/ocr.go
@ -9,7 +9,7 @@ import (

 // ProcessDocumentOCR processes a document through OCR and returns the combined text
 func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) {
-	imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID)
+	imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID, limitOcrPages)
 	defer func() {
 		for _, imagePath := range imagePaths {
 			os.Remove(imagePath)
--- a/paperless.go
+++ b/paperless.go
@ -391,7 +391,8 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum
 }

 // DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images
-func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int) ([]string, error) {
+// If limitPages > 0, only the first limitPages pages are processed; otherwise all pages are processed.
+func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int, limitPages int) ([]string, error) {
 	// Create a directory named after the document ID
 	docDir := filepath.Join(c.GetCacheFolder(), fmt.Sprintf("/document-%d", documentId))
 	if _, err := os.Stat(docDir); os.IsNotExist(err) {
@ -404,6 +405,9 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document
 	// Check if images already exist
 	var imagePaths []string
 	for n := 0; ; n++ {
+		if limitPages > 0 && n >= limitPages {
+			break
+		}
 		imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
 		if _, err := os.Stat(imagePath); os.IsNotExist(err) {
 			break
@ -452,10 +456,15 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document
 	}
 	defer doc.Close()

+	totalPages := doc.NumPage()
+	if limitPages > 0 && limitPages < totalPages {
+		totalPages = limitPages
+	}
+
 	var mu sync.Mutex
 	var g errgroup.Group

-	for n := 0; n < doc.NumPage(); n++ {
+	for n := 0; n < totalPages; n++ {
 		n := n // capture loop variable
 		g.Go(func() error {
 			mu.Lock()
--- a/paperless_test.go
+++ b/paperless_test.go
@ -385,7 +385,7 @@ func TestDownloadDocumentAsImages(t *testing.T) {
 	})

 	ctx := context.Background()
-	imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID)
+	imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 0)
 	require.NoError(t, err)

 	// Verify that exactly one page was extracted
@ -422,11 +422,11 @@ func TestDownloadDocumentAsImages_ManyPages(t *testing.T) {
 	env.client.CacheFolder = "tests/tmp"
 	// Clean the cache folder
 	os.RemoveAll(env.client.CacheFolder)
-	imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID)
+	imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 50)
 	require.NoError(t, err)

-	// Verify that exatly 52 pages were extracted
-	assert.Len(t, imagePaths, 52)
+	// Verify that exactly 50 pages were extracted - the original doc contains 52 pages
+	assert.Len(t, imagePaths, 50)
 	// The path shall end with tests/tmp/document-321/page000.jpg
 	for _, imagePath := range imagePaths {
 		_, err := os.Stat(imagePath)