feat(ocr): enhance OCR processing with structured results and hOCR support

This commit is contained in:
Dominik Schröter 2025-02-10 16:31:51 +01:00
parent 18ba388af1
commit d71d340eb5
6 changed files with 311 additions and 16 deletions

View file

@ -92,3 +92,88 @@
- E2E tests for web interface - E2E tests for web interface
- Test fixtures and mocks - Test fixtures and mocks
- Playwright for frontend testing - Playwright for frontend testing
## OCR System Patterns
### OCR Provider Architecture
#### 1. Provider Interface
- Common interface for all OCR implementations
- A single `ProcessImage` method that takes raw image bytes and returns the OCR result
- Configuration through a standardized `Config` struct
- Resource management patterns
#### 2. LLM Provider Implementation
- Supports OpenAI and Ollama vision models
- Base64 encoding for OpenAI requests
- Binary format for Ollama requests
- Template-based OCR prompts
#### 3. Google Document AI Provider
- Enterprise-grade OCR processing
- MIME type validation
- Processor configuration via environment
- Regional endpoint support
### Logging Patterns
#### 1. Provider Initialization
```
[INFO] Initializing OCR provider: llm
[INFO] Using LLM OCR provider (provider=ollama, model=minicpm-v)
```
#### 2. Processing Logs
```
[DEBUG] Starting OCR processing
[DEBUG] Image dimensions (width=800, height=1200)
[DEBUG] Using binary image format for non-OpenAI provider
[DEBUG] Sending request to vision model
[INFO] Successfully processed image (content_length=1536)
```
#### 3. Error Logging
```
[ERROR] Failed to decode image: invalid format
[ERROR] Unsupported file type: image/webp
[ERROR] Failed to get response from vision model
```
### Error Handling Patterns
#### 1. Configuration Validation
- Required parameter checks
- Environment variable validation
- Provider-specific configuration
- Connection testing
#### 2. Processing Errors
- Image format validation
- MIME type checking
- Content processing errors
- Provider-specific error handling
#### 3. Error Propagation
- Detailed error contexts
- Original error wrapping
- Logging with error context
- Recovery mechanisms
### Processing Flow
#### 1. Document Processing
```
Document Tagged → OCR Provider Selected → Image Processing → Text Extraction → Content Update
```
#### 2. Provider Selection
```
Config Check → Provider Initialization → Resource Setup → Provider Ready
```
#### 3. Error Recovery
```
Error Detection → Logging → Cleanup → Error Propagation
```
These patterns ensure consistent behavior across OCR providers while maintaining proper logging and error handling throughout the system.

13
ocr.go
View file

@ -36,13 +36,20 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string,
return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err) return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err)
} }
ocrText, err := app.ocrProvider.ProcessImage(ctx, imageContent) result, err := app.ocrProvider.ProcessImage(ctx, imageContent)
if err != nil { if err != nil {
return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err) return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err)
} }
pageLogger.Debug("OCR completed for page") if result == nil {
pageLogger.Error("Got nil result from OCR provider")
return "", fmt.Errorf("error performing OCR for document %d, page %d: nil result", documentID, i+1)
}
ocrTexts = append(ocrTexts, ocrText) pageLogger.WithField("has_hocr", result.HOCR != "").
WithField("metadata", result.Metadata).
Debug("OCR completed for page")
ocrTexts = append(ocrTexts, result.Text)
} }
docLogger.Info("OCR processing completed successfully") docLogger.Info("OCR processing completed successfully")

View file

@ -3,6 +3,7 @@ package ocr
import ( import (
"context" "context"
"fmt" "fmt"
"strings"
documentai "cloud.google.com/go/documentai/apiv1" documentai "cloud.google.com/go/documentai/apiv1"
"cloud.google.com/go/documentai/apiv1/documentaipb" "cloud.google.com/go/documentai/apiv1/documentaipb"
@ -46,7 +47,7 @@ func newGoogleDocAIProvider(config Config) (*GoogleDocAIProvider, error) {
return provider, nil return provider, nil
} }
func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) { func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
logger := log.WithFields(logrus.Fields{ logger := log.WithFields(logrus.Fields{
"project_id": p.projectID, "project_id": p.projectID,
"location": p.location, "location": p.location,
@ -60,7 +61,7 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
if !isImageMIMEType(mtype.String()) { if !isImageMIMEType(mtype.String()) {
logger.WithField("mime_type", mtype.String()).Error("Unsupported file type") logger.WithField("mime_type", mtype.String()).Error("Unsupported file type")
return "", fmt.Errorf("unsupported file type: %s", mtype.String()) return nil, fmt.Errorf("unsupported file type: %s", mtype.String())
} }
name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID) name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID)
@ -79,21 +80,40 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
resp, err := p.client.ProcessDocument(ctx, req) resp, err := p.client.ProcessDocument(ctx, req)
if err != nil { if err != nil {
logger.WithError(err).Error("Failed to process document") logger.WithError(err).Error("Failed to process document")
return "", fmt.Errorf("error processing document: %w", err) return nil, fmt.Errorf("error processing document: %w", err)
} }
if resp == nil || resp.Document == nil { if resp == nil || resp.Document == nil {
logger.Error("Received nil response or document from Document AI") logger.Error("Received nil response or document from Document AI")
return "", fmt.Errorf("received nil response or document from Document AI") return nil, fmt.Errorf("received nil response or document from Document AI")
} }
if resp.Document.Error != nil { if resp.Document.Error != nil {
logger.WithField("error", resp.Document.Error.Message).Error("Document processing error") logger.WithField("error", resp.Document.Error.Message).Error("Document processing error")
return "", fmt.Errorf("document processing error: %s", resp.Document.Error.Message) return nil, fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
} }
logger.WithField("content_length", len(resp.Document.Text)).Info("Successfully processed document") result := &OCRResult{
return resp.Document.Text, nil Text: resp.Document.Text,
Metadata: map[string]string{
"provider": "google_docai",
"mime_type": mtype.String(),
"lang_code": resp.Document.GetPages()[0].GetDetectedLanguages()[0].GetLanguageCode(),
"page_count": fmt.Sprintf("%d", len(resp.Document.GetPages())),
"processor_id": p.processorID,
},
}
// Add hOCR output if available
if len(resp.Document.GetPages()) > 0 {
hocr := generateHOCR(resp.Document)
if hocr != "" {
result.HOCR = hocr
}
}
logger.WithField("content_length", len(result.Text)).Info("Successfully processed document")
return result, nil
} }
// isImageMIMEType checks if the given MIME type is a supported image type // isImageMIMEType checks if the given MIME type is a supported image type
@ -109,6 +129,68 @@ func isImageMIMEType(mimeType string) bool {
return supportedTypes[mimeType] return supportedTypes[mimeType]
} }
// generateHOCR converts a Document AI response to hOCR format.
//
// It returns the empty string when doc has no pages. Otherwise it emits an
// XHTML document containing one ocr_page element per page, one ocr_par
// element per paragraph that has at least four normalized bounding-poly
// vertices, and one ocrx_word span per non-empty text segment of that
// paragraph's text anchor. Paragraph bounding boxes are derived from
// vertices 0 and 2 (opposite corners) scaled by the page dimensions.
//
// Text segments whose indices fall outside doc's text are skipped instead of
// panicking, and segment text is escaped so that characters such as '<' or
// '&' in the recognized text cannot break the surrounding markup.
func generateHOCR(doc *documentaipb.Document) string {
	pages := doc.GetPages()
	if len(pages) == 0 {
		return ""
	}

	fullText := doc.GetText()
	textLen := int64(len(fullText))

	// Recognized text ends up as element content, so only the characters that
	// are significant there need escaping. "&" is listed first for clarity;
	// strings.Replacer never re-scans its own output.
	escaper := strings.NewReplacer("&", "&amp;", "<", "&lt;", ">", "&gt;")

	var hocr strings.Builder
	hocr.WriteString(`<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>OCR Output</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='google-docai' />
</head>
<body>`)

	for pageNum, page := range pages {
		pageWidth := page.GetDimension().GetWidth()
		pageHeight := page.GetDimension().GetHeight()

		fmt.Fprintf(&hocr, `
<div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`,
			pageNum+1, int(pageWidth), int(pageHeight))

		// Process paragraphs
		for paraNum, para := range page.GetParagraphs() {
			paraBox := para.GetLayout().GetBoundingPoly().GetNormalizedVertices()
			if len(paraBox) < 4 {
				continue
			}

			// Convert normalized coordinates to absolute
			x1 := int(paraBox[0].GetX() * pageWidth)
			y1 := int(paraBox[0].GetY() * pageHeight)
			x2 := int(paraBox[2].GetX() * pageWidth)
			y2 := int(paraBox[2].GetY() * pageHeight)

			// The id combines the 1-based page and paragraph positions so it
			// is unique within the document (the paragraph count would give
			// every paragraph on a page the same id).
			fmt.Fprintf(&hocr, `
<p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`,
				pageNum+1, paraNum+1, x1, y1, x2, y2)

			// Process text segments within paragraph
			for _, token := range para.GetLayout().GetTextAnchor().GetTextSegments() {
				start, end := token.GetStartIndex(), token.GetEndIndex()
				// Guard against malformed anchors; slicing with such indices
				// would panic.
				if start < 0 || end > textLen || start >= end {
					continue
				}
				hocr.WriteString(`
<span class='ocrx_word'>`)
				escaper.WriteString(&hocr, fullText[start:end])
				hocr.WriteString(`</span>`)
			}

			hocr.WriteString("\n </p>")
		}
		hocr.WriteString("\n </div>")
	}

	hocr.WriteString("\n</body>\n</html>")
	return hocr.String()
}
// Close releases resources used by the provider // Close releases resources used by the provider
func (p *GoogleDocAIProvider) Close() error { func (p *GoogleDocAIProvider) Close() error {
if p.client != nil { if p.client != nil {

View file

@ -0,0 +1,99 @@
package ocr
import (
"context"
"regexp"
"strings"
"testing"
"cloud.google.com/go/documentai/apiv1/documentaipb"
)
// TestGenerateHOCR checks generateHOCR against a document without pages
// (which must yield an empty string) and a one-page, one-paragraph document
// (whose output must match the expected page/paragraph/word markup and carry
// the fixed hOCR preamble).
func TestGenerateHOCR(t *testing.T) {
	type testCase struct {
		name     string
		doc      *documentaipb.Document
		expected string // regular expression; empty means "expect no output"
	}

	// One 800x600 page holding a single paragraph that spans the whole text.
	singleParagraph := &documentaipb.Document_Page_Paragraph{
		Layout: &documentaipb.Document_Page_Layout{
			BoundingPoly: &documentaipb.BoundingPoly{
				NormalizedVertices: []*documentaipb.NormalizedVertex{
					{X: 0.1, Y: 0.1},
					{X: 0.9, Y: 0.1},
					{X: 0.9, Y: 0.2},
					{X: 0.1, Y: 0.2},
				},
			},
			TextAnchor: &documentaipb.Document_TextAnchor{
				TextSegments: []*documentaipb.Document_TextAnchor_TextSegment{
					{StartIndex: 0, EndIndex: 11},
				},
			},
		},
	}
	singlePageDoc := &documentaipb.Document{
		Text: "Hello World",
		Pages: []*documentaipb.Document_Page{
			{
				Dimension: &documentaipb.Document_Page_Dimension{
					Width:  800,
					Height: 600,
				},
				Paragraphs: []*documentaipb.Document_Page_Paragraph{singleParagraph},
			},
		},
	}

	cases := []testCase{
		{
			name:     "empty document",
			doc:      &documentaipb.Document{},
			expected: "",
		},
		{
			name: "single page with one paragraph",
			doc:  singlePageDoc,
			expected: "(?s).*<div class='ocr_page' id='page_1' title='image;bbox 0 0 800 600'>.*" +
				"<p class='ocr_par' id='par_1_1' title='bbox 80 60 720 120'>.*" +
				"<span class='ocrx_word'>Hello World</span>.*</p>.*</div>.*",
		},
	}

	// Fragments of the fixed hOCR preamble, each paired with the failure
	// message reported when it is absent.
	preamble := []struct {
		fragment string
		failure  string
	}{
		{"<?xml version=\"1.0\" encoding=\"UTF-8\"?>", "missing XML declaration"},
		{"<html xmlns=\"http://www.w3.org/1999/xhtml\"", "missing HTML namespace"},
		{"<meta name='ocr-system' content='google-docai'", "missing OCR system metadata"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := generateHOCR(tc.doc)

			// An empty expectation means the output itself must be empty.
			if tc.expected == "" {
				if got != "" {
					t.Errorf("expected empty string, got %v", got)
				}
				return
			}

			ok, err := regexp.MatchString(tc.expected, got)
			if err != nil {
				t.Fatalf("error matching regex: %v", err)
			}
			if !ok {
				t.Errorf("expected to match regex %v\ngot: %v", tc.expected, got)
			}

			// Verify basic hOCR structure
			for _, p := range preamble {
				if !strings.Contains(got, p.fragment) {
					t.Error(p.failure)
				}
			}
		})
	}
}
// testContext returns the background context: it is never cancelled and
// carries no deadline or values.
func testContext() context.Context {
	ctx := context.Background()
	return ctx
}

View file

@ -60,7 +60,7 @@ func newLLMProvider(config Config) (*LLMProvider, error) {
}, nil }, nil
} }
func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) { func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
logger := log.WithFields(logrus.Fields{ logger := log.WithFields(logrus.Fields{
"provider": p.provider, "provider": p.provider,
"model": p.model, "model": p.model,
@ -71,7 +71,7 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
img, _, err := image.Decode(bytes.NewReader(imageContent)) img, _, err := image.Decode(bytes.NewReader(imageContent))
if err != nil { if err != nil {
logger.WithError(err).Error("Failed to decode image") logger.WithError(err).Error("Failed to decode image")
return "", fmt.Errorf("error decoding image: %w", err) return nil, fmt.Errorf("error decoding image: %w", err)
} }
bounds := img.Bounds() bounds := img.Bounds()
logger.WithFields(logrus.Fields{ logger.WithFields(logrus.Fields{
@ -106,11 +106,18 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
}) })
if err != nil { if err != nil {
logger.WithError(err).Error("Failed to get response from vision model") logger.WithError(err).Error("Failed to get response from vision model")
return "", fmt.Errorf("error getting response from LLM: %w", err) return nil, fmt.Errorf("error getting response from LLM: %w", err)
} }
logger.WithField("content_length", len(completion.Choices[0].Content)).Info("Successfully processed image") result := &OCRResult{
return completion.Choices[0].Content, nil Text: completion.Choices[0].Content,
Metadata: map[string]string{
"provider": p.provider,
"model": p.model,
},
}
logger.WithField("content_length", len(result.Text)).Info("Successfully processed image")
return result, nil
} }
// createOpenAIClient creates a new OpenAI vision model client // createOpenAIClient creates a new OpenAI vision model client

View file

@ -9,9 +9,21 @@ import (
var log = logrus.New() var log = logrus.New()
// OCRResult holds the output from OCR processing. It is the value returned
// (by pointer) from Provider.ProcessImage.
type OCRResult struct {
	// Text is the plain text output (required).
	Text string
	// HOCR is the hOCR output (optional, set only if the provider supports
	// it). It is empty when no hOCR was produced.
	HOCR string
	// Metadata holds additional provider-specific key/value pairs, such as
	// the "provider" and "model" entries set by the LLM provider.
	Metadata map[string]string
}
// Provider defines the interface for OCR processing // Provider defines the interface for OCR processing
type Provider interface { type Provider interface {
ProcessImage(ctx context.Context, imageContent []byte) (string, error) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error)
} }
// Config holds the OCR provider configuration // Config holds the OCR provider configuration
@ -27,6 +39,9 @@ type Config struct {
// LLM settings (from existing config) // LLM settings (from existing config)
VisionLLMProvider string VisionLLMProvider string
VisionLLMModel string VisionLLMModel string
// OCR output options
EnableHOCR bool // Whether to request hOCR output if supported by the provider
} }
// NewProvider creates a new OCR provider based on configuration // NewProvider creates a new OCR provider based on configuration