feat(ocr): enhance OCR processing with structured results and hOCR support

2025-03-13 05:08:01 -05:00 · 2025-02-10 16:31:51 +01:00 · 2025-02-10 16:31:51 +01:00 · d71d340eb5
commit d71d340eb5
parent 18ba388af1
6 changed files with 311 additions and 16 deletions
--- a/cline_docs/systemPatterns.md
+++ b/cline_docs/systemPatterns.md
@ -92,3 +92,88 @@
 - E2E tests for web interface
 - Test fixtures and mocks
 - Playwright for frontend testing
 ## OCR System Patterns
 ### OCR Provider Architecture
 #### 1. Provider Interface
 - Common interface for all OCR implementations
 - Single `ProcessImage` method that takes raw image bytes and returns a structured OCR result
 - Configuration through standardized Config struct
 - Resource management patterns
 #### 2. LLM Provider Implementation
 - Supports OpenAI and Ollama vision models
 - Base64 encoding for OpenAI requests
 - Binary format for Ollama requests
 - Template-based OCR prompts
 #### 3. Google Document AI Provider
 - Enterprise-grade OCR processing
 - MIME type validation
 - Processor configuration via environment
 - Regional endpoint support
 ### Logging Patterns
 #### 1. Provider Initialization
 ```
 [INFO] Initializing OCR provider: llm
 [INFO] Using LLM OCR provider (provider=ollama, model=minicpm-v)
 ```
 #### 2. Processing Logs
 ```
 [DEBUG] Starting OCR processing
 [DEBUG] Image dimensions (width=800, height=1200)
 [DEBUG] Using binary image format for non-OpenAI provider
 [DEBUG] Sending request to vision model
 [INFO] Successfully processed image (content_length=1536)
 ```
 #### 3. Error Logging
 ```
 [ERROR] Failed to decode image: invalid format
 [ERROR] Unsupported file type: image/webp
 [ERROR] Failed to get response from vision model
 ```
 ### Error Handling Patterns
 #### 1. Configuration Validation
 - Required parameter checks
 - Environment variable validation
 - Provider-specific configuration
 - Connection testing
 #### 2. Processing Errors
 - Image format validation
 - MIME type checking
 - Content processing errors
 - Provider-specific error handling
 #### 3. Error Propagation
 - Detailed error contexts
 - Original error wrapping
 - Logging with error context
 - Recovery mechanisms
 ### Processing Flow
 #### 1. Document Processing
 ```
 Document Tagged → OCR Provider Selected → Image Processing → Text Extraction → Content Update
 ```
 #### 2. Provider Selection
 ```
 Config Check → Provider Initialization → Resource Setup → Provider Ready
 ```
 #### 3. Error Recovery
 ```
 Error Detection → Logging → Cleanup → Error Propagation
 ```
 These patterns ensure consistent behavior across OCR providers while maintaining proper logging and error handling throughout the system.
--- a/ocr.go
+++ b/ocr.go
@ -36,13 +36,20 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string,
 			return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err)
 		}
-		ocrText, err := app.ocrProvider.ProcessImage(ctx, imageContent)
+		result, err := app.ocrProvider.ProcessImage(ctx, imageContent)
 		if err != nil {
 			return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err)
 		}
-		pageLogger.Debug("OCR completed for page")
+		if result == nil {
 			pageLogger.Error("Got nil result from OCR provider")
 			return "", fmt.Errorf("error performing OCR for document %d, page %d: nil result", documentID, i+1)
 		}
-		ocrTexts = append(ocrTexts, ocrText)
+		pageLogger.WithField("has_hocr", result.HOCR != "").
 			WithField("metadata", result.Metadata).
 			Debug("OCR completed for page")
 		ocrTexts = append(ocrTexts, result.Text)
 	}
 	docLogger.Info("OCR processing completed successfully")
--- a/ocr/google_docai_provider.go
+++ b/ocr/google_docai_provider.go
@ -3,6 +3,7 @@ package ocr
 import (
 	"context"
 	"fmt"
 	"strings"
 	documentai "cloud.google.com/go/documentai/apiv1"
 	"cloud.google.com/go/documentai/apiv1/documentaipb"
@ -46,7 +47,7 @@ func newGoogleDocAIProvider(config Config) (*GoogleDocAIProvider, error) {
 	return provider, nil
 }
-func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
+func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
 	logger := log.WithFields(logrus.Fields{
 		"project_id":   p.projectID,
 		"location":     p.location,
@ -60,7 +61,7 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
 	if !isImageMIMEType(mtype.String()) {
 		logger.WithField("mime_type", mtype.String()).Error("Unsupported file type")
-		return "", fmt.Errorf("unsupported file type: %s", mtype.String())
+		return nil, fmt.Errorf("unsupported file type: %s", mtype.String())
 	}
 	name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID)
@ -79,21 +80,40 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
 	resp, err := p.client.ProcessDocument(ctx, req)
 	if err != nil {
 		logger.WithError(err).Error("Failed to process document")
-		return "", fmt.Errorf("error processing document: %w", err)
+		return nil, fmt.Errorf("error processing document: %w", err)
 	}
 	if resp == nil || resp.Document == nil {
 		logger.Error("Received nil response or document from Document AI")
-		return "", fmt.Errorf("received nil response or document from Document AI")
+		return nil, fmt.Errorf("received nil response or document from Document AI")
 	}
 	if resp.Document.Error != nil {
 		logger.WithField("error", resp.Document.Error.Message).Error("Document processing error")
-		return "", fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
+		return nil, fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
 	}
-	logger.WithField("content_length", len(resp.Document.Text)).Info("Successfully processed document")
+	result := &OCRResult{
-	return resp.Document.Text, nil
+		Text: resp.Document.Text,
 		Metadata: map[string]string{
 			"provider":     "google_docai",
 			"mime_type":    mtype.String(),
 			"lang_code":    resp.Document.GetPages()[0].GetDetectedLanguages()[0].GetLanguageCode(),
 			"page_count":   fmt.Sprintf("%d", len(resp.Document.GetPages())),
 			"processor_id": p.processorID,
 		},
 	}
 	// Add hOCR output if available
 	if len(resp.Document.GetPages()) > 0 {
 		hocr := generateHOCR(resp.Document)
 		if hocr != "" {
 			result.HOCR = hocr
 		}
 	}
 	logger.WithField("content_length", len(result.Text)).Info("Successfully processed document")
 	return result, nil
 }
 // isImageMIMEType checks if the given MIME type is a supported image type
@ -109,6 +129,68 @@ func isImageMIMEType(mimeType string) bool {
 	return supportedTypes[mimeType]
 }
 // generateHOCR renders a Document AI document as an hOCR (XHTML) string.
 //
 // It returns "" when doc has no pages. Otherwise it emits one ocr_page div per
 // page, one ocr_par element per paragraph that has at least four normalized
 // bounding vertices, and one ocrx_word span per non-empty text segment of the
 // paragraph's text anchor. Paragraph ids have the form par_<page>_<paragraph>
 // (both 1-based), so each paragraph on a page gets a distinct id. Segment text
 // is escaped before being written, and segments whose indices fall outside
 // doc.Text are skipped instead of panicking.
 func generateHOCR(doc *documentaipb.Document) string {
 	pages := doc.GetPages()
 	if len(pages) == 0 {
 		return ""
 	}

 	fullText := doc.GetText()
 	// Recognized text is arbitrary input; escape the characters that would
 	// otherwise be parsed as markup inside the XHTML output.
 	escaper := strings.NewReplacer("&", "&amp;", "<", "&lt;", ">", "&gt;")

 	var hocr strings.Builder
 	hocr.WriteString(`<?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
    <title>OCR Output</title>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
    <meta name='ocr-system' content='google-docai' />
 </head>
 <body>`)

 	for pageNum, page := range pages {
 		pageWidth := page.GetDimension().GetWidth()
 		pageHeight := page.GetDimension().GetHeight()
 		fmt.Fprintf(&hocr, `
    <div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`,
 			pageNum+1, int(pageWidth), int(pageHeight))

 		// Process paragraphs
 		for paraNum, para := range page.GetParagraphs() {
 			paraBox := para.GetLayout().GetBoundingPoly().GetNormalizedVertices()
 			if len(paraBox) < 4 {
 				continue
 			}

 			// Convert normalized coordinates to absolute. Vertices 0 and 2 are
 			// used as opposite corners of the box (presumably top-left and
 			// bottom-right; confirm against Document AI's vertex ordering).
 			x1 := int(paraBox[0].GetX() * pageWidth)
 			y1 := int(paraBox[0].GetY() * pageHeight)
 			x2 := int(paraBox[2].GetX() * pageWidth)
 			y2 := int(paraBox[2].GetY() * pageHeight)

 			// Use the paragraph's own position, not the paragraph count, so
 			// ids do not collide within a page.
 			fmt.Fprintf(&hocr, `
        <p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`,
 				pageNum+1, paraNum+1, x1, y1, x2, y2)

 			// Process words within paragraph
 			for _, token := range para.GetLayout().GetTextAnchor().GetTextSegments() {
 				start, end := token.GetStartIndex(), token.GetEndIndex()
 				// Skip empty, inverted, or out-of-range segments; slicing
 				// fullText with them would yield nothing or panic.
 				if start < 0 || end > int64(len(fullText)) || start >= end {
 					continue
 				}
 				fmt.Fprintf(&hocr, `
            <span class='ocrx_word'>%s</span>`, escaper.Replace(fullText[start:end]))
 			}

 			hocr.WriteString("\n        </p>")
 		}
 		hocr.WriteString("\n    </div>")
 	}

 	hocr.WriteString("\n</body>\n</html>")
 	return hocr.String()
 }
 // Close releases resources used by the provider
 func (p *GoogleDocAIProvider) Close() error {
 	if p.client != nil {
--- a/ocr/google_docai_provider_test.go
+++ b/ocr/google_docai_provider_test.go
@ -0,0 +1,99 @@
 package ocr
 import (
 	"context"
 	"regexp"
 	"strings"
 	"testing"
 	"cloud.google.com/go/documentai/apiv1/documentaipb"
 )
 // TestGenerateHOCR exercises generateHOCR with an empty document (which must
 // yield an empty string) and with a single-page, single-paragraph document
 // (which must match the expected hOCR pattern and carry the document header).
 func TestGenerateHOCR(t *testing.T) {
 	// One 800x600 page holding a single paragraph whose only text segment
 	// spans all of "Hello World".
 	helloWorld := &documentaipb.Document{
 		Text: "Hello World",
 		Pages: []*documentaipb.Document_Page{{
 			Dimension: &documentaipb.Document_Page_Dimension{Width: 800, Height: 600},
 			Paragraphs: []*documentaipb.Document_Page_Paragraph{{
 				Layout: &documentaipb.Document_Page_Layout{
 					BoundingPoly: &documentaipb.BoundingPoly{
 						NormalizedVertices: []*documentaipb.NormalizedVertex{
 							{X: 0.1, Y: 0.1},
 							{X: 0.9, Y: 0.1},
 							{X: 0.9, Y: 0.2},
 							{X: 0.1, Y: 0.2},
 						},
 					},
 					TextAnchor: &documentaipb.Document_TextAnchor{
 						TextSegments: []*documentaipb.Document_TextAnchor_TextSegment{
 							{StartIndex: 0, EndIndex: 11},
 						},
 					},
 				},
 			}},
 		}},
 	}

 	cases := []struct {
 		name    string
 		doc     *documentaipb.Document
 		pattern string // regular expression the output must match; "" means the output must be empty
 	}{
 		{name: "empty document", doc: &documentaipb.Document{}, pattern: ""},
 		{
 			name: "single page with one paragraph",
 			doc:  helloWorld,
 			pattern: "(?s).*<div class='ocr_page' id='page_1' title='image;bbox 0 0 800 600'>.*" +
 				"<p class='ocr_par' id='par_1_1' title='bbox 80 60 720 120'>.*" +
 				"<span class='ocrx_word'>Hello World</span>.*</p>.*</div>.*",
 		},
 	}

 	// Fragments of the fixed hOCR header that every non-empty output must
 	// contain, paired with the failure reported when one is absent.
 	required := []struct{ fragment, failure string }{
 		{`<?xml version="1.0" encoding="UTF-8"?>`, "missing XML declaration"},
 		{`<html xmlns="http://www.w3.org/1999/xhtml"`, "missing HTML namespace"},
 		{`<meta name='ocr-system' content='google-docai'`, "missing OCR system metadata"},
 	}

 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			got := generateHOCR(tc.doc)

 			if tc.pattern == "" {
 				if got != "" {
 					t.Errorf("expected empty string, got %v", got)
 				}
 				return
 			}

 			ok, err := regexp.MatchString(tc.pattern, got)
 			if err != nil {
 				t.Fatalf("error matching regex: %v", err)
 			}
 			if !ok {
 				t.Errorf("expected to match regex %v\ngot: %v", tc.pattern, got)
 			}

 			// Verify basic hOCR structure
 			for _, req := range required {
 				if !strings.Contains(got, req.fragment) {
 					t.Error(req.failure)
 				}
 			}
 		})
 	}
 }
 // testContext returns the root background context, which carries no deadline
 // and is never cancelled, for use as the base context in tests.
 func testContext() context.Context {
 	ctx := context.Background()
 	return ctx
 }
--- a/ocr/llm_provider.go
+++ b/ocr/llm_provider.go
@ -60,7 +60,7 @@ func newLLMProvider(config Config) (*LLMProvider, error) {
 	}, nil
 }
-func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
+func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
 	logger := log.WithFields(logrus.Fields{
 		"provider": p.provider,
 		"model":    p.model,
@ -71,7 +71,7 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
 	img, _, err := image.Decode(bytes.NewReader(imageContent))
 	if err != nil {
 		logger.WithError(err).Error("Failed to decode image")
-		return "", fmt.Errorf("error decoding image: %w", err)
+		return nil, fmt.Errorf("error decoding image: %w", err)
 	}
 	bounds := img.Bounds()
 	logger.WithFields(logrus.Fields{
@ -106,11 +106,18 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
 	})
 	if err != nil {
 		logger.WithError(err).Error("Failed to get response from vision model")
-		return "", fmt.Errorf("error getting response from LLM: %w", err)
+		return nil, fmt.Errorf("error getting response from LLM: %w", err)
 	}
-	logger.WithField("content_length", len(completion.Choices[0].Content)).Info("Successfully processed image")
+	result := &OCRResult{
-	return completion.Choices[0].Content, nil
+		Text: completion.Choices[0].Content,
 		Metadata: map[string]string{
 			"provider": p.provider,
 			"model":    p.model,
 		},
 	}
 	logger.WithField("content_length", len(result.Text)).Info("Successfully processed image")
 	return result, nil
 }
 // createOpenAIClient creates a new OpenAI vision model client
--- a/ocr/provider.go
+++ b/ocr/provider.go
@ -9,9 +9,21 @@ import (
 var log = logrus.New()
 // OCRResult holds the output of running OCR on a single image. It is the
 // value returned (by pointer) from Provider.ProcessImage.
 type OCRResult struct {
 	// Text is the plain text output (required).
 	Text string
 	// HOCR is the hOCR output (optional, if provider supports it). It is the
 	// empty string when the provider produced no hOCR.
 	HOCR string
 	// Metadata holds additional provider-specific key/value pairs, such as
 	// the "provider" name that produced the result.
 	Metadata map[string]string
 }
 // Provider defines the interface for OCR processing. Its single method,
 // ProcessImage, takes the raw bytes of one image and returns either a
 // structured *OCRResult or an error describing why OCR failed.
 type Provider interface {
-	ProcessImage(ctx context.Context, imageContent []byte) (string, error)
+	ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error)
 }
 // Config holds the OCR provider configuration
@ -27,6 +39,9 @@ type Config struct {
 	// LLM settings (from existing config)
 	VisionLLMProvider string
 	VisionLLMModel    string
 	// OCR output options
 	EnableHOCR bool // Whether to request hOCR output if supported by the provider
 }
 // NewProvider creates a new OCR provider based on configuration