mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-12 12:58:02 -05:00
Limit pages for OCR (#102)
Bonus: Adds logging for image dimensions and size. Closes #95.
This commit is contained in:
parent
d72fbcb527
commit
38321c4ac9
6 changed files with 48 additions and 7 deletions
|
@ -97,6 +97,7 @@ services:
|
||||||
VISION_LLM_PROVIDER: 'ollama' # (for OCR) - openai or ollama
|
VISION_LLM_PROVIDER: 'ollama' # (for OCR) - openai or ollama
|
||||||
VISION_LLM_MODEL: 'minicpm-v' # (for OCR) - minicpm-v (ollama example), gpt-4o (for openai), etc.
|
VISION_LLM_MODEL: 'minicpm-v' # (for OCR) - minicpm-v (ollama example), gpt-4o (for openai), etc.
|
||||||
AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default: paperless-gpt-ocr-auto
|
AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default: paperless-gpt-ocr-auto
|
||||||
|
OCR_LIMIT_PAGES: '5' # Optional, default: 5. Set to 0 for no limit.
|
||||||
LOG_LEVEL: 'info' # Optional: debug, warn, error
|
LOG_LEVEL: 'info' # Optional: debug, warn, error
|
||||||
volumes:
|
volumes:
|
||||||
- ./prompts:/app/prompts # Mount the prompts directory
|
- ./prompts:/app/prompts # Mount the prompts directory
|
||||||
|
@ -166,6 +167,7 @@ services:
|
||||||
| `WEBUI_PATH` | Path for static content. Default: `./web-app/dist`. | No |
|
| `WEBUI_PATH` | Path for static content. Default: `./web-app/dist`. | No |
|
||||||
| `AUTO_GENERATE_TITLE` | Generate titles automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
|
| `AUTO_GENERATE_TITLE` | Generate titles automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
|
||||||
| `AUTO_GENERATE_TAGS` | Generate tags automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
|
| `AUTO_GENERATE_TAGS` | Generate tags automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
|
||||||
|
| `OCR_LIMIT_PAGES` | Limit the number of pages for OCR. Set to `0` for no limit. Default: `5`. | No |
|
||||||
|
|
||||||
### Custom Prompt Templates
|
### Custom Prompt Templates
|
||||||
|
|
||||||
|
|
15
app_llm.go
15
app_llm.go
|
@ -5,9 +5,12 @@ import (
|
||||||
"context"
|
"context"
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"image"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
|
_ "image/jpeg"
|
||||||
|
|
||||||
"github.com/tmc/langchaingo/llms"
|
"github.com/tmc/langchaingo/llms"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -82,15 +85,27 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
|
||||||
|
|
||||||
prompt := promptBuffer.String()
|
prompt := promptBuffer.String()
|
||||||
|
|
||||||
|
// Log the image dimensions
|
||||||
|
img, _, err := image.Decode(bytes.NewReader(jpegBytes))
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("error decoding image: %v", err)
|
||||||
|
}
|
||||||
|
bounds := img.Bounds()
|
||||||
|
log.Debugf("Image dimensions: %dx%d", bounds.Dx(), bounds.Dy())
|
||||||
|
|
||||||
// If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision
|
// If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision
|
||||||
var parts []llms.ContentPart
|
var parts []llms.ContentPart
|
||||||
if strings.ToLower(visionLlmProvider) != "openai" {
|
if strings.ToLower(visionLlmProvider) != "openai" {
|
||||||
|
// Log image size in kilobytes
|
||||||
|
log.Debugf("Image size: %d KB", len(jpegBytes)/1024)
|
||||||
parts = []llms.ContentPart{
|
parts = []llms.ContentPart{
|
||||||
llms.BinaryPart("image/jpeg", jpegBytes),
|
llms.BinaryPart("image/jpeg", jpegBytes),
|
||||||
llms.TextPart(prompt),
|
llms.TextPart(prompt),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
|
base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
|
||||||
|
// Log image size in kilobytes
|
||||||
|
log.Debugf("Image size: %d KB", len(base64Image)/1024)
|
||||||
parts = []llms.ContentPart{
|
parts = []llms.ContentPart{
|
||||||
llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
|
llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
|
||||||
llms.TextPart(prompt),
|
llms.TextPart(prompt),
|
||||||
|
|
15
main.go
15
main.go
|
@ -6,6 +6,7 @@ import (
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"text/template"
|
"text/template"
|
||||||
|
@ -43,6 +44,7 @@ var (
|
||||||
webuiPath = os.Getenv("WEBUI_PATH")
|
webuiPath = os.Getenv("WEBUI_PATH")
|
||||||
autoGenerateTitle = os.Getenv("AUTO_GENERATE_TITLE")
|
autoGenerateTitle = os.Getenv("AUTO_GENERATE_TITLE")
|
||||||
autoGenerateTags = os.Getenv("AUTO_GENERATE_TAGS")
|
autoGenerateTags = os.Getenv("AUTO_GENERATE_TAGS")
|
||||||
|
limitOcrPages int // Will be read from OCR_LIMIT_PAGES
|
||||||
|
|
||||||
// Templates
|
// Templates
|
||||||
titleTemplate *template.Template
|
titleTemplate *template.Template
|
||||||
|
@ -309,6 +311,19 @@ func validateOrDefaultEnvVars() {
|
||||||
if (llmProvider == "openai" || visionLlmProvider == "openai") && openaiAPIKey == "" {
|
if (llmProvider == "openai" || visionLlmProvider == "openai") && openaiAPIKey == "" {
|
||||||
log.Fatal("Please set the OPENAI_API_KEY environment variable for OpenAI provider.")
|
log.Fatal("Please set the OPENAI_API_KEY environment variable for OpenAI provider.")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if isOcrEnabled() {
|
||||||
|
rawLimitOcrPages := os.Getenv("OCR_LIMIT_PAGES")
|
||||||
|
if rawLimitOcrPages == "" {
|
||||||
|
limitOcrPages = 5
|
||||||
|
} else {
|
||||||
|
var err error
|
||||||
|
limitOcrPages, err = strconv.Atoi(rawLimitOcrPages)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Invalid OCR_LIMIT_PAGES value: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// processAutoTagDocuments handles the background auto-tagging of documents
|
// processAutoTagDocuments handles the background auto-tagging of documents
|
||||||
|
|
2
ocr.go
2
ocr.go
|
@ -9,7 +9,7 @@ import (
|
||||||
|
|
||||||
// ProcessDocumentOCR processes a document through OCR and returns the combined text
|
// ProcessDocumentOCR processes a document through OCR and returns the combined text
|
||||||
func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) {
|
func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) {
|
||||||
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID)
|
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID, limitOcrPages)
|
||||||
defer func() {
|
defer func() {
|
||||||
for _, imagePath := range imagePaths {
|
for _, imagePath := range imagePaths {
|
||||||
os.Remove(imagePath)
|
os.Remove(imagePath)
|
||||||
|
|
13
paperless.go
13
paperless.go
|
@ -391,7 +391,8 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum
|
||||||
}
|
}
|
||||||
|
|
||||||
// DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images
|
// DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images
|
||||||
func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int) ([]string, error) {
|
// If limitPages > 0, only the first N pages will be processed
|
||||||
|
func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int, limitPages int) ([]string, error) {
|
||||||
// Create a directory named after the document ID
|
// Create a directory named after the document ID
|
||||||
docDir := filepath.Join(c.GetCacheFolder(), fmt.Sprintf("/document-%d", documentId))
|
docDir := filepath.Join(c.GetCacheFolder(), fmt.Sprintf("/document-%d", documentId))
|
||||||
if _, err := os.Stat(docDir); os.IsNotExist(err) {
|
if _, err := os.Stat(docDir); os.IsNotExist(err) {
|
||||||
|
@ -404,6 +405,9 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document
|
||||||
// Check if images already exist
|
// Check if images already exist
|
||||||
var imagePaths []string
|
var imagePaths []string
|
||||||
for n := 0; ; n++ {
|
for n := 0; ; n++ {
|
||||||
|
if limitPages > 0 && n >= limitPages {
|
||||||
|
break
|
||||||
|
}
|
||||||
imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
|
imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
|
||||||
if _, err := os.Stat(imagePath); os.IsNotExist(err) {
|
if _, err := os.Stat(imagePath); os.IsNotExist(err) {
|
||||||
break
|
break
|
||||||
|
@ -452,10 +456,15 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document
|
||||||
}
|
}
|
||||||
defer doc.Close()
|
defer doc.Close()
|
||||||
|
|
||||||
|
totalPages := doc.NumPage()
|
||||||
|
if limitPages > 0 && limitPages < totalPages {
|
||||||
|
totalPages = limitPages
|
||||||
|
}
|
||||||
|
|
||||||
var mu sync.Mutex
|
var mu sync.Mutex
|
||||||
var g errgroup.Group
|
var g errgroup.Group
|
||||||
|
|
||||||
for n := 0; n < doc.NumPage(); n++ {
|
for n := 0; n < totalPages; n++ {
|
||||||
n := n // capture loop variable
|
n := n // capture loop variable
|
||||||
g.Go(func() error {
|
g.Go(func() error {
|
||||||
mu.Lock()
|
mu.Lock()
|
||||||
|
|
|
@ -385,7 +385,7 @@ func TestDownloadDocumentAsImages(t *testing.T) {
|
||||||
})
|
})
|
||||||
|
|
||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID)
|
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 0)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
// Verify that exactly one page was extracted
|
// Verify that exactly one page was extracted
|
||||||
|
@ -422,11 +422,11 @@ func TestDownloadDocumentAsImages_ManyPages(t *testing.T) {
|
||||||
env.client.CacheFolder = "tests/tmp"
|
env.client.CacheFolder = "tests/tmp"
|
||||||
// Clean the cache folder
|
// Clean the cache folder
|
||||||
os.RemoveAll(env.client.CacheFolder)
|
os.RemoveAll(env.client.CacheFolder)
|
||||||
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID)
|
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 50)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
// Verify that exactly 52 pages were extracted
|
// Verify that exactly 50 pages were extracted - the original doc contains 52 pages
|
||||||
assert.Len(t, imagePaths, 52)
|
assert.Len(t, imagePaths, 50)
|
||||||
// The path shall end with tests/tmp/document-321/page000.jpg
|
// The path shall end with tests/tmp/document-321/page000.jpg
|
||||||
for _, imagePath := range imagePaths {
|
for _, imagePath := range imagePaths {
|
||||||
_, err := os.Stat(imagePath)
|
_, err := os.Stat(imagePath)
|
||||||
|
|
Loading…
Reference in a new issue