Limit pages for OCR (#102)

Bonus: Adds logging for image dimensions and size

Closes #95
This commit is contained in:
Icereed 2025-01-10 17:03:53 +01:00 committed by GitHub
parent d72fbcb527
commit 38321c4ac9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 48 additions and 7 deletions

View file

@ -97,6 +97,7 @@ services:
VISION_LLM_PROVIDER: 'ollama' # (for OCR) - openai or ollama VISION_LLM_PROVIDER: 'ollama' # (for OCR) - openai or ollama
VISION_LLM_MODEL: 'minicpm-v' # (for OCR) - minicpm-v (ollama example), gpt-4o (for openai), etc. VISION_LLM_MODEL: 'minicpm-v' # (for OCR) - minicpm-v (ollama example), gpt-4o (for openai), etc.
AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default: paperless-gpt-ocr-auto AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default: paperless-gpt-ocr-auto
OCR_LIMIT_PAGES: '5' # Optional, default: 5. Set to 0 for no limit.
LOG_LEVEL: 'info' # Optional: debug, warn, error LOG_LEVEL: 'info' # Optional: debug, warn, error
volumes: volumes:
- ./prompts:/app/prompts # Mount the prompts directory - ./prompts:/app/prompts # Mount the prompts directory
@ -166,6 +167,7 @@ services:
| `WEBUI_PATH` | Path for static content. Default: `./web-app/dist`. | No | | `WEBUI_PATH` | Path for static content. Default: `./web-app/dist`. | No |
| `AUTO_GENERATE_TITLE` | Generate titles automatically if `paperless-gpt-auto` is used. Default: `true`. | No | | `AUTO_GENERATE_TITLE` | Generate titles automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
| `AUTO_GENERATE_TAGS` | Generate tags automatically if `paperless-gpt-auto` is used. Default: `true`. | No | | `AUTO_GENERATE_TAGS` | Generate tags automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
| `OCR_LIMIT_PAGES` | Limit the number of pages for OCR. Set to `0` for no limit. Default: `5`. | No |
### Custom Prompt Templates ### Custom Prompt Templates

View file

@ -5,9 +5,12 @@ import (
"context" "context"
"encoding/base64" "encoding/base64"
"fmt" "fmt"
"image"
"strings" "strings"
"sync" "sync"
_ "image/jpeg"
"github.com/tmc/langchaingo/llms" "github.com/tmc/langchaingo/llms"
) )
@ -82,15 +85,27 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
prompt := promptBuffer.String() prompt := promptBuffer.String()
// Log the image dimensions
img, _, err := image.Decode(bytes.NewReader(jpegBytes))
if err != nil {
return "", fmt.Errorf("error decoding image: %v", err)
}
bounds := img.Bounds()
log.Debugf("Image dimensions: %dx%d", bounds.Dx(), bounds.Dy())
// If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision // If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision
var parts []llms.ContentPart var parts []llms.ContentPart
if strings.ToLower(visionLlmProvider) != "openai" { if strings.ToLower(visionLlmProvider) != "openai" {
// Log image size in kilobytes
log.Debugf("Image size: %d KB", len(jpegBytes)/1024)
parts = []llms.ContentPart{ parts = []llms.ContentPart{
llms.BinaryPart("image/jpeg", jpegBytes), llms.BinaryPart("image/jpeg", jpegBytes),
llms.TextPart(prompt), llms.TextPart(prompt),
} }
} else { } else {
base64Image := base64.StdEncoding.EncodeToString(jpegBytes) base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
// Log image size in kilobytes
log.Debugf("Image size: %d KB", len(base64Image)/1024)
parts = []llms.ContentPart{ parts = []llms.ContentPart{
llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)), llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
llms.TextPart(prompt), llms.TextPart(prompt),

15
main.go
View file

@ -6,6 +6,7 @@ import (
"net/http" "net/http"
"os" "os"
"path/filepath" "path/filepath"
"strconv"
"strings" "strings"
"sync" "sync"
"text/template" "text/template"
@ -43,6 +44,7 @@ var (
webuiPath = os.Getenv("WEBUI_PATH") webuiPath = os.Getenv("WEBUI_PATH")
autoGenerateTitle = os.Getenv("AUTO_GENERATE_TITLE") autoGenerateTitle = os.Getenv("AUTO_GENERATE_TITLE")
autoGenerateTags = os.Getenv("AUTO_GENERATE_TAGS") autoGenerateTags = os.Getenv("AUTO_GENERATE_TAGS")
limitOcrPages int // Will be read from OCR_LIMIT_PAGES
// Templates // Templates
titleTemplate *template.Template titleTemplate *template.Template
@ -309,6 +311,19 @@ func validateOrDefaultEnvVars() {
if (llmProvider == "openai" || visionLlmProvider == "openai") && openaiAPIKey == "" { if (llmProvider == "openai" || visionLlmProvider == "openai") && openaiAPIKey == "" {
log.Fatal("Please set the OPENAI_API_KEY environment variable for OpenAI provider.") log.Fatal("Please set the OPENAI_API_KEY environment variable for OpenAI provider.")
} }
if isOcrEnabled() {
rawLimitOcrPages := os.Getenv("OCR_LIMIT_PAGES")
if rawLimitOcrPages == "" {
limitOcrPages = 5
} else {
var err error
limitOcrPages, err = strconv.Atoi(rawLimitOcrPages)
if err != nil {
log.Fatalf("Invalid OCR_LIMIT_PAGES value: %v", err)
}
}
}
} }
// processAutoTagDocuments handles the background auto-tagging of documents // processAutoTagDocuments handles the background auto-tagging of documents

2
ocr.go
View file

@ -9,7 +9,7 @@ import (
// ProcessDocumentOCR processes a document through OCR and returns the combined text // ProcessDocumentOCR processes a document through OCR and returns the combined text
func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) { func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) {
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID) imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID, limitOcrPages)
defer func() { defer func() {
for _, imagePath := range imagePaths { for _, imagePath := range imagePaths {
os.Remove(imagePath) os.Remove(imagePath)

View file

@ -391,7 +391,8 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum
} }
// DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images // DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images
func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int) ([]string, error) { // If limitPages > 0, only the first N pages will be processed
func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int, limitPages int) ([]string, error) {
// Create a directory named after the document ID // Create a directory named after the document ID
docDir := filepath.Join(c.GetCacheFolder(), fmt.Sprintf("/document-%d", documentId)) docDir := filepath.Join(c.GetCacheFolder(), fmt.Sprintf("/document-%d", documentId))
if _, err := os.Stat(docDir); os.IsNotExist(err) { if _, err := os.Stat(docDir); os.IsNotExist(err) {
@ -404,6 +405,9 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document
// Check if images already exist // Check if images already exist
var imagePaths []string var imagePaths []string
for n := 0; ; n++ { for n := 0; ; n++ {
if limitPages > 0 && n >= limitPages {
break
}
imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n)) imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
if _, err := os.Stat(imagePath); os.IsNotExist(err) { if _, err := os.Stat(imagePath); os.IsNotExist(err) {
break break
@ -452,10 +456,15 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document
} }
defer doc.Close() defer doc.Close()
totalPages := doc.NumPage()
if limitPages > 0 && limitPages < totalPages {
totalPages = limitPages
}
var mu sync.Mutex var mu sync.Mutex
var g errgroup.Group var g errgroup.Group
for n := 0; n < doc.NumPage(); n++ { for n := 0; n < totalPages; n++ {
n := n // capture loop variable n := n // capture loop variable
g.Go(func() error { g.Go(func() error {
mu.Lock() mu.Lock()

View file

@ -385,7 +385,7 @@ func TestDownloadDocumentAsImages(t *testing.T) {
}) })
ctx := context.Background() ctx := context.Background()
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID) imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 0)
require.NoError(t, err) require.NoError(t, err)
// Verify that exactly one page was extracted // Verify that exactly one page was extracted
@ -422,11 +422,11 @@ func TestDownloadDocumentAsImages_ManyPages(t *testing.T) {
env.client.CacheFolder = "tests/tmp" env.client.CacheFolder = "tests/tmp"
// Clean the cache folder // Clean the cache folder
os.RemoveAll(env.client.CacheFolder) os.RemoveAll(env.client.CacheFolder)
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID) imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 50)
require.NoError(t, err) require.NoError(t, err)
// Verify that exactly 52 pages were extracted // Verify that exactly 50 pages were extracted - the original doc contains 52 pages
assert.Len(t, imagePaths, 52) assert.Len(t, imagePaths, 50)
// The path shall end with tests/tmp/document-321/page000.jpg // The path shall end with tests/tmp/document-321/page000.jpg
for _, imagePath := range imagePaths { for _, imagePath := range imagePaths {
_, err := os.Stat(imagePath) _, err := os.Stat(imagePath)