mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-13 05:08:01 -05:00
feat(ocr): enhance OCR processing with structured results and hOCR support
This commit is contained in:
parent
18ba388af1
commit
d71d340eb5
6 changed files with 311 additions and 16 deletions
|
@ -92,3 +92,88 @@
|
||||||
- E2E tests for web interface
|
- E2E tests for web interface
|
||||||
- Test fixtures and mocks
|
- Test fixtures and mocks
|
||||||
- Playwright for frontend testing
|
- Playwright for frontend testing
|
||||||
|
|
||||||
|
## OCR System Patterns
|
||||||
|
|
||||||
|
### OCR Provider Architecture
|
||||||
|
|
||||||
|
#### 1. Provider Interface
|
||||||
|
- Common interface for all OCR implementations
|
||||||
|
- Methods for image processing
|
||||||
|
- Configuration through standardized Config struct
|
||||||
|
- Resource management patterns
|
||||||
|
|
||||||
|
#### 2. LLM Provider Implementation
|
||||||
|
- Supports OpenAI and Ollama vision models
|
||||||
|
- Base64 encoding for OpenAI requests
|
||||||
|
- Binary format for Ollama requests
|
||||||
|
- Template-based OCR prompts
|
||||||
|
|
||||||
|
#### 3. Google Document AI Provider
|
||||||
|
- Enterprise-grade OCR processing
|
||||||
|
- MIME type validation
|
||||||
|
- Processor configuration via environment
|
||||||
|
- Regional endpoint support
|
||||||
|
|
||||||
|
### Logging Patterns
|
||||||
|
|
||||||
|
#### 1. Provider Initialization
|
||||||
|
```
|
||||||
|
[INFO] Initializing OCR provider: llm
|
||||||
|
[INFO] Using LLM OCR provider (provider=ollama, model=minicpm-v)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Processing Logs
|
||||||
|
```
|
||||||
|
[DEBUG] Starting OCR processing
|
||||||
|
[DEBUG] Image dimensions (width=800, height=1200)
|
||||||
|
[DEBUG] Using binary image format for non-OpenAI provider
|
||||||
|
[DEBUG] Sending request to vision model
|
||||||
|
[INFO] Successfully processed image (content_length=1536)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Error Logging
|
||||||
|
```
|
||||||
|
[ERROR] Failed to decode image: invalid format
|
||||||
|
[ERROR] Unsupported file type: image/webp
|
||||||
|
[ERROR] Failed to get response from vision model
|
||||||
|
```
|
||||||
|
|
||||||
|
### Error Handling Patterns
|
||||||
|
|
||||||
|
#### 1. Configuration Validation
|
||||||
|
- Required parameter checks
|
||||||
|
- Environment variable validation
|
||||||
|
- Provider-specific configuration
|
||||||
|
- Connection testing
|
||||||
|
|
||||||
|
#### 2. Processing Errors
|
||||||
|
- Image format validation
|
||||||
|
- MIME type checking
|
||||||
|
- Content processing errors
|
||||||
|
- Provider-specific error handling
|
||||||
|
|
||||||
|
#### 3. Error Propagation
|
||||||
|
- Detailed error contexts
|
||||||
|
- Original error wrapping
|
||||||
|
- Logging with error context
|
||||||
|
- Recovery mechanisms
|
||||||
|
|
||||||
|
### Processing Flow
|
||||||
|
|
||||||
|
#### 1. Document Processing
|
||||||
|
```
|
||||||
|
Document Tagged → OCR Provider Selected → Image Processing → Text Extraction → Content Update
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Provider Selection
|
||||||
|
```
|
||||||
|
Config Check → Provider Initialization → Resource Setup → Provider Ready
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Error Recovery
|
||||||
|
```
|
||||||
|
Error Detection → Logging → Cleanup → Error Propagation
|
||||||
|
```
|
||||||
|
|
||||||
|
These patterns ensure consistent behavior across OCR providers while maintaining proper logging and error handling throughout the system.
|
||||||
|
|
13
ocr.go
13
ocr.go
|
@ -36,13 +36,20 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string,
|
||||||
return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err)
|
return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
ocrText, err := app.ocrProvider.ProcessImage(ctx, imageContent)
|
result, err := app.ocrProvider.ProcessImage(ctx, imageContent)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err)
|
return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err)
|
||||||
}
|
}
|
||||||
pageLogger.Debug("OCR completed for page")
|
if result == nil {
|
||||||
|
pageLogger.Error("Got nil result from OCR provider")
|
||||||
|
return "", fmt.Errorf("error performing OCR for document %d, page %d: nil result", documentID, i+1)
|
||||||
|
}
|
||||||
|
|
||||||
ocrTexts = append(ocrTexts, ocrText)
|
pageLogger.WithField("has_hocr", result.HOCR != "").
|
||||||
|
WithField("metadata", result.Metadata).
|
||||||
|
Debug("OCR completed for page")
|
||||||
|
|
||||||
|
ocrTexts = append(ocrTexts, result.Text)
|
||||||
}
|
}
|
||||||
|
|
||||||
docLogger.Info("OCR processing completed successfully")
|
docLogger.Info("OCR processing completed successfully")
|
||||||
|
|
|
@ -3,6 +3,7 @@ package ocr
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
documentai "cloud.google.com/go/documentai/apiv1"
|
documentai "cloud.google.com/go/documentai/apiv1"
|
||||||
"cloud.google.com/go/documentai/apiv1/documentaipb"
|
"cloud.google.com/go/documentai/apiv1/documentaipb"
|
||||||
|
@ -46,7 +47,7 @@ func newGoogleDocAIProvider(config Config) (*GoogleDocAIProvider, error) {
|
||||||
return provider, nil
|
return provider, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
|
func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
|
||||||
logger := log.WithFields(logrus.Fields{
|
logger := log.WithFields(logrus.Fields{
|
||||||
"project_id": p.projectID,
|
"project_id": p.projectID,
|
||||||
"location": p.location,
|
"location": p.location,
|
||||||
|
@ -60,7 +61,7 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
|
||||||
|
|
||||||
if !isImageMIMEType(mtype.String()) {
|
if !isImageMIMEType(mtype.String()) {
|
||||||
logger.WithField("mime_type", mtype.String()).Error("Unsupported file type")
|
logger.WithField("mime_type", mtype.String()).Error("Unsupported file type")
|
||||||
return "", fmt.Errorf("unsupported file type: %s", mtype.String())
|
return nil, fmt.Errorf("unsupported file type: %s", mtype.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID)
|
name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID)
|
||||||
|
@ -79,21 +80,40 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
|
||||||
resp, err := p.client.ProcessDocument(ctx, req)
|
resp, err := p.client.ProcessDocument(ctx, req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.WithError(err).Error("Failed to process document")
|
logger.WithError(err).Error("Failed to process document")
|
||||||
return "", fmt.Errorf("error processing document: %w", err)
|
return nil, fmt.Errorf("error processing document: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if resp == nil || resp.Document == nil {
|
if resp == nil || resp.Document == nil {
|
||||||
logger.Error("Received nil response or document from Document AI")
|
logger.Error("Received nil response or document from Document AI")
|
||||||
return "", fmt.Errorf("received nil response or document from Document AI")
|
return nil, fmt.Errorf("received nil response or document from Document AI")
|
||||||
}
|
}
|
||||||
|
|
||||||
if resp.Document.Error != nil {
|
if resp.Document.Error != nil {
|
||||||
logger.WithField("error", resp.Document.Error.Message).Error("Document processing error")
|
logger.WithField("error", resp.Document.Error.Message).Error("Document processing error")
|
||||||
return "", fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
|
return nil, fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.WithField("content_length", len(resp.Document.Text)).Info("Successfully processed document")
|
result := &OCRResult{
|
||||||
return resp.Document.Text, nil
|
Text: resp.Document.Text,
|
||||||
|
Metadata: map[string]string{
|
||||||
|
"provider": "google_docai",
|
||||||
|
"mime_type": mtype.String(),
|
||||||
|
"lang_code": resp.Document.GetPages()[0].GetDetectedLanguages()[0].GetLanguageCode(),
|
||||||
|
"page_count": fmt.Sprintf("%d", len(resp.Document.GetPages())),
|
||||||
|
"processor_id": p.processorID,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add hOCR output if available
|
||||||
|
if len(resp.Document.GetPages()) > 0 {
|
||||||
|
hocr := generateHOCR(resp.Document)
|
||||||
|
if hocr != "" {
|
||||||
|
result.HOCR = hocr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.WithField("content_length", len(result.Text)).Info("Successfully processed document")
|
||||||
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// isImageMIMEType checks if the given MIME type is a supported image type
|
// isImageMIMEType checks if the given MIME type is a supported image type
|
||||||
|
@ -109,6 +129,68 @@ func isImageMIMEType(mimeType string) bool {
|
||||||
return supportedTypes[mimeType]
|
return supportedTypes[mimeType]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// generateHOCR converts Document AI response to hOCR format
|
||||||
|
func generateHOCR(doc *documentaipb.Document) string {
|
||||||
|
if len(doc.GetPages()) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
var hocr strings.Builder
|
||||||
|
hocr.WriteString(`<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||||
|
<head>
|
||||||
|
<title>OCR Output</title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
||||||
|
<meta name='ocr-system' content='google-docai' />
|
||||||
|
</head>
|
||||||
|
<body>`)
|
||||||
|
|
||||||
|
for pageNum, page := range doc.GetPages() {
|
||||||
|
pageWidth := page.GetDimension().GetWidth()
|
||||||
|
pageHeight := page.GetDimension().GetHeight()
|
||||||
|
|
||||||
|
hocr.WriteString(fmt.Sprintf(`
|
||||||
|
<div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`,
|
||||||
|
pageNum+1, int(pageWidth), int(pageHeight)))
|
||||||
|
|
||||||
|
// Process paragraphs
|
||||||
|
for _, para := range page.GetParagraphs() {
|
||||||
|
paraBox := para.GetLayout().GetBoundingPoly().GetNormalizedVertices()
|
||||||
|
if len(paraBox) < 4 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert normalized coordinates to absolute
|
||||||
|
x1 := int(paraBox[0].GetX() * pageWidth)
|
||||||
|
y1 := int(paraBox[0].GetY() * pageHeight)
|
||||||
|
x2 := int(paraBox[2].GetX() * pageWidth)
|
||||||
|
y2 := int(paraBox[2].GetY() * pageHeight)
|
||||||
|
|
||||||
|
hocr.WriteString(fmt.Sprintf(`
|
||||||
|
<p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`,
|
||||||
|
pageNum+1, len(page.GetParagraphs()), x1, y1, x2, y2))
|
||||||
|
|
||||||
|
// Process words within paragraph
|
||||||
|
for _, token := range para.GetLayout().GetTextAnchor().GetTextSegments() {
|
||||||
|
text := doc.Text[token.GetStartIndex():token.GetEndIndex()]
|
||||||
|
if text == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
hocr.WriteString(fmt.Sprintf(`
|
||||||
|
<span class='ocrx_word'>%s</span>`, text))
|
||||||
|
}
|
||||||
|
|
||||||
|
hocr.WriteString("\n </p>")
|
||||||
|
}
|
||||||
|
hocr.WriteString("\n </div>")
|
||||||
|
}
|
||||||
|
|
||||||
|
hocr.WriteString("\n</body>\n</html>")
|
||||||
|
return hocr.String()
|
||||||
|
}
|
||||||
|
|
||||||
// Close releases resources used by the provider
|
// Close releases resources used by the provider
|
||||||
func (p *GoogleDocAIProvider) Close() error {
|
func (p *GoogleDocAIProvider) Close() error {
|
||||||
if p.client != nil {
|
if p.client != nil {
|
||||||
|
|
99
ocr/google_docai_provider_test.go
Normal file
99
ocr/google_docai_provider_test.go
Normal file
|
@ -0,0 +1,99 @@
|
||||||
|
package ocr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"cloud.google.com/go/documentai/apiv1/documentaipb"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGenerateHOCR(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
doc *documentaipb.Document
|
||||||
|
expected string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "empty document",
|
||||||
|
doc: &documentaipb.Document{},
|
||||||
|
expected: "",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "single page with one paragraph",
|
||||||
|
doc: &documentaipb.Document{
|
||||||
|
Text: "Hello World",
|
||||||
|
Pages: []*documentaipb.Document_Page{
|
||||||
|
{
|
||||||
|
Dimension: &documentaipb.Document_Page_Dimension{
|
||||||
|
Width: 800,
|
||||||
|
Height: 600,
|
||||||
|
},
|
||||||
|
Paragraphs: []*documentaipb.Document_Page_Paragraph{
|
||||||
|
{
|
||||||
|
Layout: &documentaipb.Document_Page_Layout{
|
||||||
|
BoundingPoly: &documentaipb.BoundingPoly{
|
||||||
|
NormalizedVertices: []*documentaipb.NormalizedVertex{
|
||||||
|
{X: 0.1, Y: 0.1},
|
||||||
|
{X: 0.9, Y: 0.1},
|
||||||
|
{X: 0.9, Y: 0.2},
|
||||||
|
{X: 0.1, Y: 0.2},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
TextAnchor: &documentaipb.Document_TextAnchor{
|
||||||
|
TextSegments: []*documentaipb.Document_TextAnchor_TextSegment{
|
||||||
|
{
|
||||||
|
StartIndex: 0,
|
||||||
|
EndIndex: 11,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expected: "(?s).*<div class='ocr_page' id='page_1' title='image;bbox 0 0 800 600'>.*" +
|
||||||
|
"<p class='ocr_par' id='par_1_1' title='bbox 80 60 720 120'>.*" +
|
||||||
|
"<span class='ocrx_word'>Hello World</span>.*</p>.*</div>.*",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
result := generateHOCR(tt.doc)
|
||||||
|
|
||||||
|
if tt.expected == "" {
|
||||||
|
if result != "" {
|
||||||
|
t.Errorf("expected empty string, got %v", result)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
matched, err := regexp.MatchString(tt.expected, result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("error matching regex: %v", err)
|
||||||
|
}
|
||||||
|
if !matched {
|
||||||
|
t.Errorf("expected to match regex %v\ngot: %v", tt.expected, result)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify basic hOCR structure
|
||||||
|
if !strings.Contains(result, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>") {
|
||||||
|
t.Error("missing XML declaration")
|
||||||
|
}
|
||||||
|
if !strings.Contains(result, "<html xmlns=\"http://www.w3.org/1999/xhtml\"") {
|
||||||
|
t.Error("missing HTML namespace")
|
||||||
|
}
|
||||||
|
if !strings.Contains(result, "<meta name='ocr-system' content='google-docai'") {
|
||||||
|
t.Error("missing OCR system metadata")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// testContext returns the background context for use in tests.
func testContext() context.Context {
	ctx := context.Background()
	return ctx
}
|
|
@ -60,7 +60,7 @@ func newLLMProvider(config Config) (*LLMProvider, error) {
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
|
func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
|
||||||
logger := log.WithFields(logrus.Fields{
|
logger := log.WithFields(logrus.Fields{
|
||||||
"provider": p.provider,
|
"provider": p.provider,
|
||||||
"model": p.model,
|
"model": p.model,
|
||||||
|
@ -71,7 +71,7 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
|
||||||
img, _, err := image.Decode(bytes.NewReader(imageContent))
|
img, _, err := image.Decode(bytes.NewReader(imageContent))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.WithError(err).Error("Failed to decode image")
|
logger.WithError(err).Error("Failed to decode image")
|
||||||
return "", fmt.Errorf("error decoding image: %w", err)
|
return nil, fmt.Errorf("error decoding image: %w", err)
|
||||||
}
|
}
|
||||||
bounds := img.Bounds()
|
bounds := img.Bounds()
|
||||||
logger.WithFields(logrus.Fields{
|
logger.WithFields(logrus.Fields{
|
||||||
|
@ -106,11 +106,18 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.WithError(err).Error("Failed to get response from vision model")
|
logger.WithError(err).Error("Failed to get response from vision model")
|
||||||
return "", fmt.Errorf("error getting response from LLM: %w", err)
|
return nil, fmt.Errorf("error getting response from LLM: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.WithField("content_length", len(completion.Choices[0].Content)).Info("Successfully processed image")
|
result := &OCRResult{
|
||||||
return completion.Choices[0].Content, nil
|
Text: completion.Choices[0].Content,
|
||||||
|
Metadata: map[string]string{
|
||||||
|
"provider": p.provider,
|
||||||
|
"model": p.model,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
logger.WithField("content_length", len(result.Text)).Info("Successfully processed image")
|
||||||
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// createOpenAIClient creates a new OpenAI vision model client
|
// createOpenAIClient creates a new OpenAI vision model client
|
||||||
|
|
|
@ -9,9 +9,21 @@ import (
|
||||||
|
|
||||||
var log = logrus.New()
|
var log = logrus.New()
|
||||||
|
|
||||||
|
// OCRResult holds the output from OCR processing, as returned by
// Provider.ProcessImage.
type OCRResult struct {
	// Text is the plain text output (required).
	Text string

	// HOCR is the hOCR output (optional; set only if the provider supports it).
	// It is the empty string when no hOCR output is available.
	HOCR string

	// Metadata holds additional provider-specific key/value metadata.
	Metadata map[string]string
}
|
||||||
|
|
||||||
// Provider defines the interface for OCR processing
|
// Provider defines the interface for OCR processing
|
||||||
type Provider interface {
|
type Provider interface {
|
||||||
ProcessImage(ctx context.Context, imageContent []byte) (string, error)
|
ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Config holds the OCR provider configuration
|
// Config holds the OCR provider configuration
|
||||||
|
@ -27,6 +39,9 @@ type Config struct {
|
||||||
// LLM settings (from existing config)
|
// LLM settings (from existing config)
|
||||||
VisionLLMProvider string
|
VisionLLMProvider string
|
||||||
VisionLLMModel string
|
VisionLLMModel string
|
||||||
|
|
||||||
|
// OCR output options
|
||||||
|
EnableHOCR bool // Whether to request hOCR output if supported by the provider
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewProvider creates a new OCR provider based on configuration
|
// NewProvider creates a new OCR provider based on configuration
|
||||||
|
|
Loading…
Reference in a new issue