diff --git a/README.md b/README.md index d953dd4..fd0b0a7 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,7 @@ services: VISION_LLM_PROVIDER: 'ollama' # (for OCR) - openai or ollama VISION_LLM_MODEL: 'minicpm-v' # (for OCR) - minicpm-v (ollama example), gpt-4o (for openai), etc. AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default: paperless-gpt-ocr-auto + OCR_LIMIT_PAGES: '5' # Optional, default: 5. Set to 0 for no limit. LOG_LEVEL: 'info' # Optional: debug, warn, error volumes: - ./prompts:/app/prompts # Mount the prompts directory @@ -166,6 +167,7 @@ services: | `WEBUI_PATH` | Path for static content. Default: `./web-app/dist`. | No | | `AUTO_GENERATE_TITLE` | Generate titles automatically if `paperless-gpt-auto` is used. Default: `true`. | No | | `AUTO_GENERATE_TAGS` | Generate tags automatically if `paperless-gpt-auto` is used. Default: `true`. | No | +| `OCR_LIMIT_PAGES` | Limit the number of pages for OCR. Set to `0` for no limit. Default: `5`. | No | ### Custom Prompt Templates diff --git a/app_llm.go b/app_llm.go index a4a4c6b..22883a5 100644 --- a/app_llm.go +++ b/app_llm.go @@ -5,9 +5,12 @@ import ( "context" "encoding/base64" "fmt" + "image" "strings" "sync" + _ "image/jpeg" + "github.com/tmc/langchaingo/llms" ) @@ -82,15 +85,27 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro prompt := promptBuffer.String() + // Log the image dimensions + img, _, err := image.Decode(bytes.NewReader(jpegBytes)) + if err != nil { + return "", fmt.Errorf("error decoding image: %v", err) + } + bounds := img.Bounds() + log.Debugf("Image dimensions: %dx%d", bounds.Dx(), bounds.Dy()) + // If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision var parts []llms.ContentPart if strings.ToLower(visionLlmProvider) != "openai" { + // Log image size in kilobytes + log.Debugf("Image size: %d KB", len(jpegBytes)/1024) parts = []llms.ContentPart{ 
llms.BinaryPart("image/jpeg", jpegBytes), llms.TextPart(prompt), } } else { base64Image := base64.StdEncoding.EncodeToString(jpegBytes) + // Log image size in kilobytes + log.Debugf("Image size: %d KB", len(base64Image)/1024) parts = []llms.ContentPart{ llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)), llms.TextPart(prompt), diff --git a/main.go b/main.go index b795ad8..783fa7e 100644 --- a/main.go +++ b/main.go @@ -6,6 +6,7 @@ import ( "net/http" "os" "path/filepath" + "strconv" "strings" "sync" "text/template" @@ -43,6 +44,7 @@ var ( webuiPath = os.Getenv("WEBUI_PATH") autoGenerateTitle = os.Getenv("AUTO_GENERATE_TITLE") autoGenerateTags = os.Getenv("AUTO_GENERATE_TAGS") + limitOcrPages int // Will be read from OCR_LIMIT_PAGES // Templates titleTemplate *template.Template @@ -309,6 +311,19 @@ func validateOrDefaultEnvVars() { if (llmProvider == "openai" || visionLlmProvider == "openai") && openaiAPIKey == "" { log.Fatal("Please set the OPENAI_API_KEY environment variable for OpenAI provider.") } + + if isOcrEnabled() { + rawLimitOcrPages := os.Getenv("OCR_LIMIT_PAGES") + if rawLimitOcrPages == "" { + limitOcrPages = 5 + } else { + var err error + limitOcrPages, err = strconv.Atoi(rawLimitOcrPages) + if err != nil { + log.Fatalf("Invalid OCR_LIMIT_PAGES value: %v", err) + } + } + } } // processAutoTagDocuments handles the background auto-tagging of documents diff --git a/ocr.go b/ocr.go index ca8ed28..b96d74a 100644 --- a/ocr.go +++ b/ocr.go @@ -9,7 +9,7 @@ import ( // ProcessDocumentOCR processes a document through OCR and returns the combined text func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) { - imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID) + imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID, limitOcrPages) defer func() { for _, imagePath := range imagePaths { os.Remove(imagePath) diff --git a/paperless.go b/paperless.go index a96fc1e..16ab98b 100644 
--- a/paperless.go +++ b/paperless.go @@ -391,7 +391,8 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum } // DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images -func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int) ([]string, error) { +// If limitPages > 0, only the first N pages will be processed +func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int, limitPages int) ([]string, error) { // Create a directory named after the document ID docDir := filepath.Join(c.GetCacheFolder(), fmt.Sprintf("/document-%d", documentId)) if _, err := os.Stat(docDir); os.IsNotExist(err) { @@ -404,6 +405,9 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document // Check if images already exist var imagePaths []string for n := 0; ; n++ { + if limitPages > 0 && n >= limitPages { + break + } imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n)) if _, err := os.Stat(imagePath); os.IsNotExist(err) { break @@ -452,10 +456,15 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document } defer doc.Close() + totalPages := doc.NumPage() + if limitPages > 0 && limitPages < totalPages { + totalPages = limitPages + } + var mu sync.Mutex var g errgroup.Group - for n := 0; n < doc.NumPage(); n++ { + for n := 0; n < totalPages; n++ { n := n // capture loop variable g.Go(func() error { mu.Lock() diff --git a/paperless_test.go b/paperless_test.go index c75049f..85e056a 100644 --- a/paperless_test.go +++ b/paperless_test.go @@ -385,7 +385,7 @@ func TestDownloadDocumentAsImages(t *testing.T) { }) ctx := context.Background() - imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID) + imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 0) require.NoError(t, err) // Verify that exatly one page was extracted @@ -422,11 +422,11 @@ func 
TestDownloadDocumentAsImages_ManyPages(t *testing.T) { env.client.CacheFolder = "tests/tmp" // Clean the cache folder os.RemoveAll(env.client.CacheFolder) - imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID) + imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 50) require.NoError(t, err) - // Verify that exatly 52 pages were extracted - assert.Len(t, imagePaths, 52) + // Verify that exactly 50 pages were extracted - the original doc contains 52 pages + assert.Len(t, imagePaths, 50) // The path shall end with tests/tmp/document-321/page000.jpg for _, imagePath := range imagePaths { _, err := os.Stat(imagePath)