feat(ocr): enhance OCR processing with structured results and hOCR support

This commit is contained in:
Dominik Schröter 2025-02-10 16:31:51 +01:00
parent 18ba388af1
commit d71d340eb5
6 changed files with 311 additions and 16 deletions

View file

@ -92,3 +92,88 @@
- E2E tests for web interface
- Test fixtures and mocks
- Playwright for frontend testing
## OCR System Patterns
### OCR Provider Architecture
#### 1. Provider Interface
- Common interface for all OCR implementations
- A single `ProcessImage` method that takes raw image bytes and returns a structured OCR result
- Configuration through standardized Config struct
- Resource management patterns
#### 2. LLM Provider Implementation
- Supports OpenAI and Ollama vision models
- Base64 encoding for OpenAI requests
- Binary format for Ollama requests
- Template-based OCR prompts
#### 3. Google Document AI Provider
- Enterprise-grade OCR processing
- MIME type validation
- Processor configuration via environment
- Regional endpoint support
### Logging Patterns
#### 1. Provider Initialization
```
[INFO] Initializing OCR provider: llm
[INFO] Using LLM OCR provider (provider=ollama, model=minicpm-v)
```
#### 2. Processing Logs
```
[DEBUG] Starting OCR processing
[DEBUG] Image dimensions (width=800, height=1200)
[DEBUG] Using binary image format for non-OpenAI provider
[DEBUG] Sending request to vision model
[INFO] Successfully processed image (content_length=1536)
```
#### 3. Error Logging
```
[ERROR] Failed to decode image: invalid format
[ERROR] Unsupported file type: image/webp
[ERROR] Failed to get response from vision model
```
### Error Handling Patterns
#### 1. Configuration Validation
- Required parameter checks
- Environment variable validation
- Provider-specific configuration
- Connection testing
#### 2. Processing Errors
- Image format validation
- MIME type checking
- Content processing errors
- Provider-specific error handling
#### 3. Error Propagation
- Detailed error contexts
- Original error wrapping
- Logging with error context
- Recovery mechanisms
### Processing Flow
#### 1. Document Processing
```
Document Tagged → OCR Provider Selected → Image Processing → Text Extraction → Content Update
```
#### 2. Provider Selection
```
Config Check → Provider Initialization → Resource Setup → Provider Ready
```
#### 3. Error Recovery
```
Error Detection → Logging → Cleanup → Error Propagation
```
These patterns ensure consistent behavior across OCR providers while maintaining proper logging and error handling throughout the system.

13
ocr.go
View file

@ -36,13 +36,20 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string,
return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err)
}
ocrText, err := app.ocrProvider.ProcessImage(ctx, imageContent)
result, err := app.ocrProvider.ProcessImage(ctx, imageContent)
if err != nil {
return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err)
}
pageLogger.Debug("OCR completed for page")
if result == nil {
pageLogger.Error("Got nil result from OCR provider")
return "", fmt.Errorf("error performing OCR for document %d, page %d: nil result", documentID, i+1)
}
ocrTexts = append(ocrTexts, ocrText)
pageLogger.WithField("has_hocr", result.HOCR != "").
WithField("metadata", result.Metadata).
Debug("OCR completed for page")
ocrTexts = append(ocrTexts, result.Text)
}
docLogger.Info("OCR processing completed successfully")

View file

@ -3,6 +3,7 @@ package ocr
import (
"context"
"fmt"
"strings"
documentai "cloud.google.com/go/documentai/apiv1"
"cloud.google.com/go/documentai/apiv1/documentaipb"
@ -46,7 +47,7 @@ func newGoogleDocAIProvider(config Config) (*GoogleDocAIProvider, error) {
return provider, nil
}
func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
logger := log.WithFields(logrus.Fields{
"project_id": p.projectID,
"location": p.location,
@ -60,7 +61,7 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
if !isImageMIMEType(mtype.String()) {
logger.WithField("mime_type", mtype.String()).Error("Unsupported file type")
return "", fmt.Errorf("unsupported file type: %s", mtype.String())
return nil, fmt.Errorf("unsupported file type: %s", mtype.String())
}
name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID)
@ -79,21 +80,40 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
resp, err := p.client.ProcessDocument(ctx, req)
if err != nil {
logger.WithError(err).Error("Failed to process document")
return "", fmt.Errorf("error processing document: %w", err)
return nil, fmt.Errorf("error processing document: %w", err)
}
if resp == nil || resp.Document == nil {
logger.Error("Received nil response or document from Document AI")
return "", fmt.Errorf("received nil response or document from Document AI")
return nil, fmt.Errorf("received nil response or document from Document AI")
}
if resp.Document.Error != nil {
logger.WithField("error", resp.Document.Error.Message).Error("Document processing error")
return "", fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
return nil, fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
}
logger.WithField("content_length", len(resp.Document.Text)).Info("Successfully processed document")
return resp.Document.Text, nil
result := &OCRResult{
Text: resp.Document.Text,
Metadata: map[string]string{
"provider": "google_docai",
"mime_type": mtype.String(),
"lang_code": resp.Document.GetPages()[0].GetDetectedLanguages()[0].GetLanguageCode(),
"page_count": fmt.Sprintf("%d", len(resp.Document.GetPages())),
"processor_id": p.processorID,
},
}
// Add hOCR output if available
if len(resp.Document.GetPages()) > 0 {
hocr := generateHOCR(resp.Document)
if hocr != "" {
result.HOCR = hocr
}
}
logger.WithField("content_length", len(result.Text)).Info("Successfully processed document")
return result, nil
}
// isImageMIMEType checks if the given MIME type is a supported image type
@ -109,6 +129,68 @@ func isImageMIMEType(mimeType string) bool {
return supportedTypes[mimeType]
}
// hocrEscaper escapes the characters that are significant in XHTML so that
// recognized text can be embedded in the hOCR document without breaking the
// markup or injecting elements.
var hocrEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	`"`, "&quot;",
	"'", "&#39;",
)

// generateHOCR converts a Document AI response to hOCR format.
//
// It emits one ocr_page element per page and one ocr_par element per
// paragraph that has at least four normalized bounding vertices. Paragraph
// bounding boxes are converted from normalized coordinates to absolute ones
// using the page dimension. Each text segment anchored to a paragraph is
// written as an ocrx_word span with its text XML-escaped.
//
// An empty string is returned when the document has no pages (including a
// nil document). Text segments whose indices are empty, inverted, or fall
// outside the document text are skipped instead of causing a panic.
func generateHOCR(doc *documentaipb.Document) string {
	pages := doc.GetPages()
	if len(pages) == 0 {
		return ""
	}

	fullText := doc.GetText()
	textLen := int64(len(fullText))

	var hocr strings.Builder
	hocr.WriteString(`<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>OCR Output</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='google-docai' />
</head>
<body>`)

	for pageNum, page := range pages {
		pageWidth := page.GetDimension().GetWidth()
		pageHeight := page.GetDimension().GetHeight()

		fmt.Fprintf(&hocr, "\n<div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>",
			pageNum+1, int(pageWidth), int(pageHeight))

		// Process paragraphs
		for paraNum, para := range page.GetParagraphs() {
			paraBox := para.GetLayout().GetBoundingPoly().GetNormalizedVertices()
			if len(paraBox) < 4 {
				continue
			}

			// Convert normalized coordinates to absolute; vertices 0 and 2
			// are used as the opposite corners of the box.
			x1 := int(paraBox[0].GetX() * pageWidth)
			y1 := int(paraBox[0].GetY() * pageHeight)
			x2 := int(paraBox[2].GetX() * pageWidth)
			y2 := int(paraBox[2].GetY() * pageHeight)

			// Use the paragraph's own index so ids are unique within a page.
			fmt.Fprintf(&hocr, "\n<p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>",
				pageNum+1, paraNum+1, x1, y1, x2, y2)

			// Process the text segments anchored to this paragraph
			for _, segment := range para.GetLayout().GetTextAnchor().GetTextSegments() {
				start, end := segment.GetStartIndex(), segment.GetEndIndex()
				// Guard against empty, inverted, or out-of-range segments.
				if start < 0 || end > textLen || start >= end {
					continue
				}

				fmt.Fprintf(&hocr, "\n<span class='ocrx_word'>%s</span>",
					hocrEscaper.Replace(fullText[start:end]))
			}

			hocr.WriteString("\n </p>")
		}
		hocr.WriteString("\n </div>")
	}

	hocr.WriteString("\n</body>\n</html>")
	return hocr.String()
}
// Close releases resources used by the provider
func (p *GoogleDocAIProvider) Close() error {
if p.client != nil {

View file

@ -0,0 +1,99 @@
package ocr
import (
"context"
"regexp"
"strings"
"testing"
"cloud.google.com/go/documentai/apiv1/documentaipb"
)
// TestGenerateHOCR verifies generateHOCR against an empty document and a
// single-page document with one paragraph. Non-empty expectations are regular
// expressions matched against the generated hOCR, followed by checks for the
// fixed document preamble.
func TestGenerateHOCR(t *testing.T) {
	type hocrCase struct {
		name     string
		doc      *documentaipb.Document
		expected string
	}

	// One paragraph covering the whole text "Hello World".
	helloParagraph := &documentaipb.Document_Page_Paragraph{
		Layout: &documentaipb.Document_Page_Layout{
			BoundingPoly: &documentaipb.BoundingPoly{
				NormalizedVertices: []*documentaipb.NormalizedVertex{
					{X: 0.1, Y: 0.1},
					{X: 0.9, Y: 0.1},
					{X: 0.9, Y: 0.2},
					{X: 0.1, Y: 0.2},
				},
			},
			TextAnchor: &documentaipb.Document_TextAnchor{
				TextSegments: []*documentaipb.Document_TextAnchor_TextSegment{
					{StartIndex: 0, EndIndex: 11},
				},
			},
		},
	}

	helloDoc := &documentaipb.Document{
		Text: "Hello World",
		Pages: []*documentaipb.Document_Page{
			{
				Dimension: &documentaipb.Document_Page_Dimension{
					Width:  800,
					Height: 600,
				},
				Paragraphs: []*documentaipb.Document_Page_Paragraph{helloParagraph},
			},
		},
	}

	cases := []hocrCase{
		{
			name:     "empty document",
			doc:      &documentaipb.Document{},
			expected: "",
		},
		{
			name: "single page with one paragraph",
			doc:  helloDoc,
			expected: "(?s).*<div class='ocr_page' id='page_1' title='image;bbox 0 0 800 600'>.*" +
				"<p class='ocr_par' id='par_1_1' title='bbox 80 60 720 120'>.*" +
				"<span class='ocrx_word'>Hello World</span>.*</p>.*</div>.*",
		},
	}

	// Fragments every non-empty hOCR document must contain, with the failure
	// message reported when one is missing.
	structureChecks := []struct {
		fragment string
		failure  string
	}{
		{"<?xml version=\"1.0\" encoding=\"UTF-8\"?>", "missing XML declaration"},
		{"<html xmlns=\"http://www.w3.org/1999/xhtml\"", "missing HTML namespace"},
		{"<meta name='ocr-system' content='google-docai'", "missing OCR system metadata"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := generateHOCR(tc.doc)

			// An empty expectation means the output itself must be empty.
			if tc.expected == "" {
				if got != "" {
					t.Errorf("expected empty string, got %v", got)
				}
				return
			}

			ok, err := regexp.MatchString(tc.expected, got)
			if err != nil {
				t.Fatalf("error matching regex: %v", err)
			}
			if !ok {
				t.Errorf("expected to match regex %v\ngot: %v", tc.expected, got)
			}

			// Verify basic hOCR structure
			for _, check := range structureChecks {
				if !strings.Contains(got, check.fragment) {
					t.Error(check.failure)
				}
			}
		})
	}
}
// testContext returns the background context for tests that need a
// context.Context; it carries no deadline and no cancellation.
func testContext() context.Context {
	ctx := context.Background()
	return ctx
}

View file

@ -60,7 +60,7 @@ func newLLMProvider(config Config) (*LLMProvider, error) {
}, nil
}
func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
logger := log.WithFields(logrus.Fields{
"provider": p.provider,
"model": p.model,
@ -71,7 +71,7 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
img, _, err := image.Decode(bytes.NewReader(imageContent))
if err != nil {
logger.WithError(err).Error("Failed to decode image")
return "", fmt.Errorf("error decoding image: %w", err)
return nil, fmt.Errorf("error decoding image: %w", err)
}
bounds := img.Bounds()
logger.WithFields(logrus.Fields{
@ -106,11 +106,18 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
})
if err != nil {
logger.WithError(err).Error("Failed to get response from vision model")
return "", fmt.Errorf("error getting response from LLM: %w", err)
return nil, fmt.Errorf("error getting response from LLM: %w", err)
}
logger.WithField("content_length", len(completion.Choices[0].Content)).Info("Successfully processed image")
return completion.Choices[0].Content, nil
result := &OCRResult{
Text: completion.Choices[0].Content,
Metadata: map[string]string{
"provider": p.provider,
"model": p.model,
},
}
logger.WithField("content_length", len(result.Text)).Info("Successfully processed image")
return result, nil
}
// createOpenAIClient creates a new OpenAI vision model client

View file

@ -9,9 +9,21 @@ import (
var log = logrus.New()
// OCRResult is the structured output a Provider returns for a single image.
type OCRResult struct {
	// Text is the recognized plain text. Every provider is expected to fill it.
	Text string

	// HOCR is the hOCR rendering of the result. It is left empty by providers
	// that do not produce hOCR.
	HOCR string

	// Metadata carries provider-specific key/value details about the run.
	Metadata map[string]string
}
// Provider defines the interface for OCR processing.
//
// ProcessImage receives the raw bytes of one image and returns the OCR
// output for it, or an error if processing failed.
//
// NOTE(review): both the string-returning and the *OCRResult-returning
// signatures appear below; only one method named ProcessImage can exist on
// the interface — confirm that the *OCRResult form is the intended one.
type Provider interface {
ProcessImage(ctx context.Context, imageContent []byte) (string, error)
ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error)
}
// Config holds the OCR provider configuration
@ -27,6 +39,9 @@ type Config struct {
// LLM settings (from existing config)
VisionLLMProvider string
VisionLLMModel string
// OCR output options
EnableHOCR bool // Whether to request hOCR output if supported by the provider
}
// NewProvider creates a new OCR provider based on configuration