mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-13 05:08:01 -05:00
Limit pages for OCR
Bonus: Adds logging for image dimensions and size. Closes #95.
This commit is contained in:
parent
d72fbcb527
commit
ebc26b5bdf
6 changed files with 48 additions and 7 deletions
|
@ -97,6 +97,7 @@ services:
|
|||
VISION_LLM_PROVIDER: 'ollama' # (for OCR) - openai or ollama
|
||||
VISION_LLM_MODEL: 'minicpm-v' # (for OCR) - minicpm-v (ollama example), gpt-4o (for openai), etc.
|
||||
AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default: paperless-gpt-ocr-auto
|
||||
OCR_LIMIT_PAGES: '5' # Optional, default: 5. Set to 0 for no limit.
|
||||
LOG_LEVEL: 'info' # Optional: debug, warn, error
|
||||
volumes:
|
||||
- ./prompts:/app/prompts # Mount the prompts directory
|
||||
|
@ -166,6 +167,7 @@ services:
|
|||
| `WEBUI_PATH` | Path for static content. Default: `./web-app/dist`. | No |
|
||||
| `AUTO_GENERATE_TITLE` | Generate titles automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
|
||||
| `AUTO_GENERATE_TAGS` | Generate tags automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
|
||||
| `OCR_LIMIT_PAGES` | Limit the number of pages for OCR. Set to `0` for no limit. Default: `5`. | No |
|
||||
|
||||
### Custom Prompt Templates
|
||||
|
||||
|
|
15
app_llm.go
15
app_llm.go
|
@ -5,9 +5,12 @@ import (
|
|||
"context"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"image"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
_ "image/jpeg"
|
||||
|
||||
"github.com/tmc/langchaingo/llms"
|
||||
)
|
||||
|
||||
|
@ -82,15 +85,27 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
|
|||
|
||||
prompt := promptBuffer.String()
|
||||
|
||||
// Log the image dimensions
|
||||
img, _, err := image.Decode(bytes.NewReader(jpegBytes))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error decoding image: %v", err)
|
||||
}
|
||||
bounds := img.Bounds()
|
||||
log.Debugf("Image dimensions: %dx%d", bounds.Dx(), bounds.Dy())
|
||||
|
||||
// If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision
|
||||
var parts []llms.ContentPart
|
||||
if strings.ToLower(visionLlmProvider) != "openai" {
|
||||
// Log image size in kilobytes
|
||||
log.Debugf("Image size: %d KB", len(jpegBytes)/1024)
|
||||
parts = []llms.ContentPart{
|
||||
llms.BinaryPart("image/jpeg", jpegBytes),
|
||||
llms.TextPart(prompt),
|
||||
}
|
||||
} else {
|
||||
base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
|
||||
// Log the size of the base64-encoded image in kilobytes
|
||||
log.Debugf("Image size: %d KB", len(base64Image)/1024)
|
||||
parts = []llms.ContentPart{
|
||||
llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
|
||||
llms.TextPart(prompt),
|
||||
|
|
15
main.go
15
main.go
|
@ -6,6 +6,7 @@ import (
|
|||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"text/template"
|
||||
|
@ -43,6 +44,7 @@ var (
|
|||
webuiPath = os.Getenv("WEBUI_PATH")
|
||||
autoGenerateTitle = os.Getenv("AUTO_GENERATE_TITLE")
|
||||
autoGenerateTags = os.Getenv("AUTO_GENERATE_TAGS")
|
||||
limitOcrPages int // Will be read from OCR_LIMIT_PAGES
|
||||
|
||||
// Templates
|
||||
titleTemplate *template.Template
|
||||
|
@ -309,6 +311,19 @@ func validateOrDefaultEnvVars() {
|
|||
if (llmProvider == "openai" || visionLlmProvider == "openai") && openaiAPIKey == "" {
|
||||
log.Fatal("Please set the OPENAI_API_KEY environment variable for OpenAI provider.")
|
||||
}
|
||||
|
||||
if isOcrEnabled() {
|
||||
rawLimitOcrPages := os.Getenv("OCR_LIMIT_PAGES")
|
||||
if rawLimitOcrPages == "" {
|
||||
limitOcrPages = 5
|
||||
} else {
|
||||
var err error
|
||||
limitOcrPages, err = strconv.Atoi(rawLimitOcrPages)
|
||||
if err != nil {
|
||||
log.Fatalf("Invalid OCR_LIMIT_PAGES value: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// processAutoTagDocuments handles the background auto-tagging of documents
|
||||
|
|
2
ocr.go
2
ocr.go
|
@ -9,7 +9,7 @@ import (
|
|||
|
||||
// ProcessDocumentOCR processes a document through OCR and returns the combined text
|
||||
func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) {
|
||||
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID)
|
||||
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID, limitOcrPages)
|
||||
defer func() {
|
||||
for _, imagePath := range imagePaths {
|
||||
os.Remove(imagePath)
|
||||
|
|
13
paperless.go
13
paperless.go
|
@ -391,7 +391,8 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum
|
|||
}
|
||||
|
||||
// DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images
|
||||
func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int) ([]string, error) {
|
||||
// If limitPages > 0, only the first N pages will be processed
|
||||
func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int, limitPages int) ([]string, error) {
|
||||
// Create a directory named after the document ID
|
||||
docDir := filepath.Join(c.GetCacheFolder(), fmt.Sprintf("/document-%d", documentId))
|
||||
if _, err := os.Stat(docDir); os.IsNotExist(err) {
|
||||
|
@ -404,6 +405,9 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document
|
|||
// Check if images already exist
|
||||
var imagePaths []string
|
||||
for n := 0; ; n++ {
|
||||
if limitPages > 0 && n >= limitPages {
|
||||
break
|
||||
}
|
||||
imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
|
||||
if _, err := os.Stat(imagePath); os.IsNotExist(err) {
|
||||
break
|
||||
|
@ -452,10 +456,15 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document
|
|||
}
|
||||
defer doc.Close()
|
||||
|
||||
totalPages := doc.NumPage()
|
||||
if limitPages > 0 && limitPages < totalPages {
|
||||
totalPages = limitPages
|
||||
}
|
||||
|
||||
var mu sync.Mutex
|
||||
var g errgroup.Group
|
||||
|
||||
for n := 0; n < doc.NumPage(); n++ {
|
||||
for n := 0; n < totalPages; n++ {
|
||||
n := n // capture loop variable
|
||||
g.Go(func() error {
|
||||
mu.Lock()
|
||||
|
|
|
@ -385,7 +385,7 @@ func TestDownloadDocumentAsImages(t *testing.T) {
|
|||
})
|
||||
|
||||
ctx := context.Background()
|
||||
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID)
|
||||
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 0)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify that exactly one page was extracted
|
||||
|
@ -422,11 +422,11 @@ func TestDownloadDocumentAsImages_ManyPages(t *testing.T) {
|
|||
env.client.CacheFolder = "tests/tmp"
|
||||
// Clean the cache folder
|
||||
os.RemoveAll(env.client.CacheFolder)
|
||||
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID)
|
||||
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 50)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify that exactly 52 pages were extracted
|
||||
assert.Len(t, imagePaths, 52)
|
||||
// Verify that exactly 50 pages were extracted - the original doc contains 52 pages
|
||||
assert.Len(t, imagePaths, 50)
|
||||
// The path shall end with tests/tmp/document-321/page000.jpg
|
||||
for _, imagePath := range imagePaths {
|
||||
_, err := os.Stat(imagePath)
|
||||
|
|
Loading…
Reference in a new issue