feat(ocr): enhance OCR processing with structured results and hOCR support

2025-03-13 05:08:01 -05:00 · 2025-02-10 16:31:51 +01:00 · 2025-02-10 16:31:51 +01:00 · d71d340eb5
commit d71d340eb5
parent 18ba388af1
6 changed files with 311 additions and 16 deletions
--- a/cline_docs/systemPatterns.md
+++ b/cline_docs/systemPatterns.md
@ -92,3 +92,88 @@
 - E2E tests for web interface
 - Test fixtures and mocks
 - Playwright for frontend testing
+
+## OCR System Patterns
+
+### OCR Provider Architecture
+
+#### 1. Provider Interface
+- Common interface for all OCR implementations
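+- Single `ProcessImage` method that takes raw image bytes and returns a structured `OCRResult` (plain text, optional hOCR, provider metadata)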
+- Configuration through standardized Config struct
+- Resource management patterns
+
+#### 2. LLM Provider Implementation
+- Supports OpenAI and Ollama vision models
+- Base64 encoding for OpenAI requests
+- Binary format for Ollama requests
+- Template-based OCR prompts
+
+#### 3. Google Document AI Provider
+- Enterprise-grade OCR processing
+- MIME type validation
+- Processor configuration via environment
+- Regional endpoint support
+
+### Logging Patterns
+
+#### 1. Provider Initialization
+```
+[INFO] Initializing OCR provider: llm
+[INFO] Using LLM OCR provider (provider=ollama, model=minicpm-v)
+```
+
+#### 2. Processing Logs
+```
+[DEBUG] Starting OCR processing
+[DEBUG] Image dimensions (width=800, height=1200)
+[DEBUG] Using binary image format for non-OpenAI provider
+[DEBUG] Sending request to vision model
+[INFO] Successfully processed image (content_length=1536)
+```
+
+#### 3. Error Logging
+```
+[ERROR] Failed to decode image: invalid format
+[ERROR] Unsupported file type: image/webp
+[ERROR] Failed to get response from vision model
+```
+
+### Error Handling Patterns
+
+#### 1. Configuration Validation
+- Required parameter checks
+- Environment variable validation
+- Provider-specific configuration
+- Connection testing
+
+#### 2. Processing Errors
+- Image format validation
+- MIME type checking
+- Content processing errors
+- Provider-specific error handling
+
+#### 3. Error Propagation
+- Detailed error contexts
+- Original error wrapping
+- Logging with error context
+- Recovery mechanisms
+
+### Processing Flow
+
+#### 1. Document Processing
+```
+Document Tagged → OCR Provider Selected → Image Processing → Text Extraction → Content Update
+```
+
+#### 2. Provider Selection
+```
+Config Check → Provider Initialization → Resource Setup → Provider Ready
+```
+
+#### 3. Error Recovery
+```
+Error Detection → Logging → Cleanup → Error Propagation
+```
+
+These patterns ensure consistent behavior across OCR providers while maintaining proper logging and error handling throughout the system.
--- a/ocr.go
+++ b/ocr.go
@ -36,13 +36,20 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string,
 			return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err)
 		}

-		ocrText, err := app.ocrProvider.ProcessImage(ctx, imageContent)
+		result, err := app.ocrProvider.ProcessImage(ctx, imageContent)
 		if err != nil {
 			return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err)
 		}
-		pageLogger.Debug("OCR completed for page")
+		if result == nil {
+			pageLogger.Error("Got nil result from OCR provider")
+			return "", fmt.Errorf("error performing OCR for document %d, page %d: nil result", documentID, i+1)
+		}

-		ocrTexts = append(ocrTexts, ocrText)
+		pageLogger.WithField("has_hocr", result.HOCR != "").
+			WithField("metadata", result.Metadata).
+			Debug("OCR completed for page")
+
+		ocrTexts = append(ocrTexts, result.Text)
 	}

 	docLogger.Info("OCR processing completed successfully")
--- a/ocr/google_docai_provider.go
+++ b/ocr/google_docai_provider.go
@ -3,6 +3,7 @@ package ocr
 import (
 	"context"
 	"fmt"
+	"html"
+	"strings"

 	documentai "cloud.google.com/go/documentai/apiv1"
 	"cloud.google.com/go/documentai/apiv1/documentaipb"
@ -46,7 +47,7 @@ func newGoogleDocAIProvider(config Config) (*GoogleDocAIProvider, error) {
 	return provider, nil
 }

-func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
+func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
 	logger := log.WithFields(logrus.Fields{
 		"project_id":   p.projectID,
 		"location":     p.location,
@ -60,7 +61,7 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b

 	if !isImageMIMEType(mtype.String()) {
 		logger.WithField("mime_type", mtype.String()).Error("Unsupported file type")
-		return "", fmt.Errorf("unsupported file type: %s", mtype.String())
+		return nil, fmt.Errorf("unsupported file type: %s", mtype.String())
 	}

 	name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID)
@ -79,21 +80,40 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
 	resp, err := p.client.ProcessDocument(ctx, req)
 	if err != nil {
 		logger.WithError(err).Error("Failed to process document")
-		return "", fmt.Errorf("error processing document: %w", err)
+		return nil, fmt.Errorf("error processing document: %w", err)
 	}

 	if resp == nil || resp.Document == nil {
 		logger.Error("Received nil response or document from Document AI")
-		return "", fmt.Errorf("received nil response or document from Document AI")
+		return nil, fmt.Errorf("received nil response or document from Document AI")
 	}

 	if resp.Document.Error != nil {
 		logger.WithField("error", resp.Document.Error.Message).Error("Document processing error")
-		return "", fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
+		return nil, fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
 	}

-	logger.WithField("content_length", len(resp.Document.Text)).Info("Successfully processed document")
-	return resp.Document.Text, nil
+	result := &OCRResult{
+		Text: resp.Document.Text,
+		Metadata: map[string]string{
+			"provider":     "google_docai",
+			"mime_type":    mtype.String(),
+			"lang_code":    resp.Document.GetPages()[0].GetDetectedLanguages()[0].GetLanguageCode(),
+			"page_count":   fmt.Sprintf("%d", len(resp.Document.GetPages())),
+			"processor_id": p.processorID,
+		},
+	}
+
+	// Add hOCR output if available
+	if len(resp.Document.GetPages()) > 0 {
+		hocr := generateHOCR(resp.Document)
+		if hocr != "" {
+			result.HOCR = hocr
+		}
+	}
+
+	logger.WithField("content_length", len(result.Text)).Info("Successfully processed document")
+	return result, nil
 }

 // isImageMIMEType checks if the given MIME type is a supported image type
@ -109,6 +129,68 @@ func isImageMIMEType(mimeType string) bool {
 	return supportedTypes[mimeType]
 }

+// generateHOCR renders a Document AI response as an hOCR (XHTML) document.
+//
+// Each page becomes an `ocr_page` div whose bbox is the page dimension. Each
+// paragraph that has at least four normalized vertices becomes an `ocr_par`
+// element with id `par_<page>_<paragraph>` (both 1-based, so ids are unique
+// within the document). Every text segment anchored to a paragraph is written
+// as an HTML-escaped `ocrx_word` span.
+//
+// It returns "" when doc has no pages.
+func generateHOCR(doc *documentaipb.Document) string {
+	pages := doc.GetPages()
+	if len(pages) == 0 {
+		return ""
+	}
+
+	fullText := doc.GetText()
+	textLen := int64(len(fullText))
+
+	var hocr strings.Builder
+	hocr.WriteString(`<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+    <title>OCR Output</title>
+    <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
+    <meta name='ocr-system' content='google-docai' />
+</head>
+<body>`)
+
+	for pageIdx, page := range pages {
+		pageWidth := page.GetDimension().GetWidth()
+		pageHeight := page.GetDimension().GetHeight()
+
+		fmt.Fprintf(&hocr, `
+    <div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`,
+			pageIdx+1, int(pageWidth), int(pageHeight))
+
+		for paraIdx, para := range page.GetParagraphs() {
+			vertices := para.GetLayout().GetBoundingPoly().GetNormalizedVertices()
+			if len(vertices) < 4 {
+				continue
+			}
+
+			// Normalized vertices are fractions of the page size, so scale
+			// them to absolute units. Vertices 0 and 2 are taken as opposite
+			// corners of the box (assumes the usual top-left-first, clockwise
+			// ordering - TODO confirm for rotated text).
+			x1 := int(vertices[0].GetX() * pageWidth)
+			y1 := int(vertices[0].GetY() * pageHeight)
+			x2 := int(vertices[2].GetX() * pageWidth)
+			y2 := int(vertices[2].GetY() * pageHeight)
+
+			// Use the paragraph's own index for the id; the paragraph count
+			// would give every paragraph on the page the same id.
+			fmt.Fprintf(&hocr, `
+        <p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`,
+				pageIdx+1, paraIdx+1, x1, y1, x2, y2)
+
+			// Emit one span per text segment anchored to this paragraph.
+			for _, seg := range para.GetLayout().GetTextAnchor().GetTextSegments() {
+				// Clamp to the document text so a malformed anchor cannot
+				// cause an out-of-range slice panic.
+				start, end := seg.GetStartIndex(), seg.GetEndIndex()
+				if start < 0 {
+					start = 0
+				}
+				if end > textLen {
+					end = textLen
+				}
+				if start >= end {
+					continue
+				}
+
+				// OCR text is arbitrary content; escape it so characters such
+				// as '<' and '&' cannot break the XHTML output.
+				fmt.Fprintf(&hocr, `
+            <span class='ocrx_word'>%s</span>`, html.EscapeString(fullText[start:end]))
+			}
+
+			hocr.WriteString("\n        </p>")
+		}
+		hocr.WriteString("\n    </div>")
+	}
+
+	hocr.WriteString("\n</body>\n</html>")
+	return hocr.String()
+}
+
 // Close releases resources used by the provider
 func (p *GoogleDocAIProvider) Close() error {
 	if p.client != nil {
--- a/ocr/google_docai_provider_test.go
+++ b/ocr/google_docai_provider_test.go
@ -0,0 +1,99 @@
+package ocr
+
+import (
+	"context"
+	"regexp"
+	"strings"
+	"testing"
+
+	"cloud.google.com/go/documentai/apiv1/documentaipb"
+)
+
+// TestGenerateHOCR checks generateHOCR against an empty document (expects an
+// empty string) and a single-page, single-paragraph document (expects the
+// page, paragraph and word markup plus the fixed hOCR header).
+func TestGenerateHOCR(t *testing.T) {
+	// One 800x600 page holding one paragraph that spans the whole text.
+	singleParagraph := &documentaipb.Document{
+		Text: "Hello World",
+		Pages: []*documentaipb.Document_Page{
+			{
+				Dimension: &documentaipb.Document_Page_Dimension{
+					Width:  800,
+					Height: 600,
+				},
+				Paragraphs: []*documentaipb.Document_Page_Paragraph{
+					{
+						Layout: &documentaipb.Document_Page_Layout{
+							BoundingPoly: &documentaipb.BoundingPoly{
+								NormalizedVertices: []*documentaipb.NormalizedVertex{
+									{X: 0.1, Y: 0.1},
+									{X: 0.9, Y: 0.1},
+									{X: 0.9, Y: 0.2},
+									{X: 0.1, Y: 0.2},
+								},
+							},
+							TextAnchor: &documentaipb.Document_TextAnchor{
+								TextSegments: []*documentaipb.Document_TextAnchor_TextSegment{
+									{
+										StartIndex: 0,
+										EndIndex:   11,
+									},
+								},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	// An empty pattern means the output itself must be empty.
+	cases := []struct {
+		name    string
+		input   *documentaipb.Document
+		pattern string
+	}{
+		{
+			name:    "empty document",
+			input:   &documentaipb.Document{},
+			pattern: "",
+		},
+		{
+			name:  "single page with one paragraph",
+			input: singleParagraph,
+			pattern: "(?s).*<div class='ocr_page' id='page_1' title='image;bbox 0 0 800 600'>.*" +
+				"<p class='ocr_par' id='par_1_1' title='bbox 80 60 720 120'>.*" +
+				"<span class='ocrx_word'>Hello World</span>.*</p>.*</div>.*",
+		},
+	}
+
+	// Fragments every non-empty hOCR document must contain, with the failure
+	// message reported when one is absent.
+	structure := []struct {
+		fragment string
+		failure  string
+	}{
+		{"<?xml version=\"1.0\" encoding=\"UTF-8\"?>", "missing XML declaration"},
+		{"<html xmlns=\"http://www.w3.org/1999/xhtml\"", "missing HTML namespace"},
+		{"<meta name='ocr-system' content='google-docai'", "missing OCR system metadata"},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := generateHOCR(tc.input)
+
+			if tc.pattern == "" {
+				if got != "" {
+					t.Errorf("expected empty string, got %v", got)
+				}
+				return
+			}
+
+			ok, err := regexp.MatchString(tc.pattern, got)
+			switch {
+			case err != nil:
+				t.Fatalf("error matching regex: %v", err)
+			case !ok:
+				t.Errorf("expected to match regex %v\ngot: %v", tc.pattern, got)
+			}
+
+			// Verify basic hOCR structure
+			for _, want := range structure {
+				if !strings.Contains(got, want.fragment) {
+					t.Error(want.failure)
+				}
+			}
+		})
+	}
+}
+
+// testContext returns context.Background() as the base context for tests.
+func testContext() context.Context {
+	ctx := context.Background()
+	return ctx
+}
--- a/ocr/llm_provider.go
+++ b/ocr/llm_provider.go
@ -60,7 +60,7 @@ func newLLMProvider(config Config) (*LLMProvider, error) {
 	}, nil
 }

-func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
+func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
 	logger := log.WithFields(logrus.Fields{
 		"provider": p.provider,
 		"model":    p.model,
@ -71,7 +71,7 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
 	img, _, err := image.Decode(bytes.NewReader(imageContent))
 	if err != nil {
 		logger.WithError(err).Error("Failed to decode image")
-		return "", fmt.Errorf("error decoding image: %w", err)
+		return nil, fmt.Errorf("error decoding image: %w", err)
 	}
 	bounds := img.Bounds()
 	logger.WithFields(logrus.Fields{
@ -106,11 +106,18 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
 	})
 	if err != nil {
 		logger.WithError(err).Error("Failed to get response from vision model")
-		return "", fmt.Errorf("error getting response from LLM: %w", err)
+		return nil, fmt.Errorf("error getting response from LLM: %w", err)
 	}

-	logger.WithField("content_length", len(completion.Choices[0].Content)).Info("Successfully processed image")
-	return completion.Choices[0].Content, nil
+	result := &OCRResult{
+		Text: completion.Choices[0].Content,
+		Metadata: map[string]string{
+			"provider": p.provider,
+			"model":    p.model,
+		},
+	}
+	logger.WithField("content_length", len(result.Text)).Info("Successfully processed image")
+	return result, nil
 }

 // createOpenAIClient creates a new OpenAI vision model client
--- a/ocr/provider.go
+++ b/ocr/provider.go
@ -9,9 +9,21 @@ import (

 var log = logrus.New()

+// OCRResult holds the output of a single Provider.ProcessImage call.
+type OCRResult struct {
+	// Text is the plain text output (required).
+	Text string
+
+	// HOCR is the hOCR output (optional; left empty when the provider does not produce it).
+	HOCR string
+
+	// Metadata holds additional provider-specific key/value pairs (e.g. "provider").
+	Metadata map[string]string
+}
+
 // Provider defines the interface for OCR processing: ProcessImage takes the raw bytes of one image and returns its result or an error.
 type Provider interface {
-	ProcessImage(ctx context.Context, imageContent []byte) (string, error)
+	ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error)
 }

 // Config holds the OCR provider configuration
@ -27,6 +39,9 @@ type Config struct {
 	// LLM settings (from existing config)
 	VisionLLMProvider string
 	VisionLLMModel    string
+
+	// OCR output options
+	EnableHOCR bool // Whether to request hOCR output if supported by the provider
 }

 // NewProvider creates a new OCR provider based on configuration