From d71d340eb50d3f12e62799cf45896f499db6aea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Schr=C3=B6ter?= Date: Mon, 10 Feb 2025 16:31:51 +0100 Subject: [PATCH] feat(ocr): enhance OCR processing with structured results and hOCR support --- cline_docs/systemPatterns.md | 85 ++++++++++++++++++++++++++ ocr.go | 13 +++- ocr/google_docai_provider.go | 96 +++++++++++++++++++++++++++--- ocr/google_docai_provider_test.go | 99 +++++++++++++++++++++++++++++++ ocr/llm_provider.go | 17 ++++-- ocr/provider.go | 17 +++++- 6 files changed, 311 insertions(+), 16 deletions(-) create mode 100644 ocr/google_docai_provider_test.go diff --git a/cline_docs/systemPatterns.md b/cline_docs/systemPatterns.md index c3225f8..54f4573 100644 --- a/cline_docs/systemPatterns.md +++ b/cline_docs/systemPatterns.md @@ -92,3 +92,88 @@ - E2E tests for web interface - Test fixtures and mocks - Playwright for frontend testing + +## OCR System Patterns + +### OCR Provider Architecture + +#### 1. Provider Interface +- Common interface for all OCR implementations +- Methods for image processing +- Configuration through standardized Config struct +- Resource management patterns + +#### 2. LLM Provider Implementation +- Supports OpenAI and Ollama vision models +- Base64 encoding for OpenAI requests +- Binary format for Ollama requests +- Template-based OCR prompts + +#### 3. Google Document AI Provider +- Enterprise-grade OCR processing +- MIME type validation +- Processor configuration via environment +- Regional endpoint support + +### Logging Patterns + +#### 1. Provider Initialization +``` +[INFO] Initializing OCR provider: llm +[INFO] Using LLM OCR provider (provider=ollama, model=minicpm-v) +``` + +#### 2. Processing Logs +``` +[DEBUG] Starting OCR processing +[DEBUG] Image dimensions (width=800, height=1200) +[DEBUG] Using binary image format for non-OpenAI provider +[DEBUG] Sending request to vision model +[INFO] Successfully processed image (content_length=1536) +``` + +#### 3. Error Logging +``` +[ERROR] Failed to decode image: invalid format +[ERROR] Unsupported file type: image/webp +[ERROR] Failed to get response from vision model +``` + +### Error Handling Patterns + +#### 1. Configuration Validation +- Required parameter checks +- Environment variable validation +- Provider-specific configuration +- Connection testing + +#### 2. Processing Errors +- Image format validation +- MIME type checking +- Content processing errors +- Provider-specific error handling + +#### 3. Error Propagation +- Detailed error contexts +- Original error wrapping +- Logging with error context +- Recovery mechanisms + +### Processing Flow + +#### 1. Document Processing +``` +Document Tagged → OCR Provider Selected → Image Processing → Text Extraction → Content Update +``` + +#### 2. Provider Selection +``` +Config Check → Provider Initialization → Resource Setup → Provider Ready +``` + +#### 3. Error Recovery +``` +Error Detection → Logging → Cleanup → Error Propagation +``` + +These patterns ensure consistent behavior across OCR providers while maintaining proper logging and error handling throughout the system. diff --git a/ocr.go b/ocr.go index 9d843ec..c6e82b1 100644 --- a/ocr.go +++ b/ocr.go @@ -36,13 +36,20 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err) } - ocrText, err := app.ocrProvider.ProcessImage(ctx, imageContent) + result, err := app.ocrProvider.ProcessImage(ctx, imageContent) if err != nil { return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err) } - pageLogger.Debug("OCR completed for page") + if result == nil { + pageLogger.Error("Got nil result from OCR provider") + return "", fmt.Errorf("error performing OCR for document %d, page %d: nil result", documentID, i+1) + } - ocrTexts = append(ocrTexts, ocrText) + pageLogger.WithField("has_hocr", result.HOCR != ""). + WithField("metadata", result.Metadata). + Debug("OCR completed for page") + + ocrTexts = append(ocrTexts, result.Text) } docLogger.Info("OCR processing completed successfully") diff --git a/ocr/google_docai_provider.go b/ocr/google_docai_provider.go index 265797f..1ed8ae9 100644 --- a/ocr/google_docai_provider.go +++ b/ocr/google_docai_provider.go @@ -3,6 +3,7 @@ package ocr import ( "context" "fmt" + "strings" documentai "cloud.google.com/go/documentai/apiv1" "cloud.google.com/go/documentai/apiv1/documentaipb" @@ -46,7 +47,7 @@ func newGoogleDocAIProvider(config Config) (*GoogleDocAIProvider, error) { return provider, nil } -func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) { +func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) { logger := log.WithFields(logrus.Fields{ "project_id": p.projectID, "location": p.location, @@ -60,7 +61,7 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b if !isImageMIMEType(mtype.String()) { logger.WithField("mime_type", mtype.String()).Error("Unsupported file type") - return "", fmt.Errorf("unsupported file type: %s", mtype.String()) + return nil, fmt.Errorf("unsupported file type: %s", mtype.String()) } name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID) @@ -79,21 +80,40 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b resp, err := p.client.ProcessDocument(ctx, req) if err != nil { logger.WithError(err).Error("Failed to process document") - return "", fmt.Errorf("error processing document: %w", err) + return nil, fmt.Errorf("error processing document: %w", err) } if resp == nil || resp.Document == nil { logger.Error("Received nil response or document from Document AI") - return "", fmt.Errorf("received nil response or document from Document AI") + return nil, fmt.Errorf("received nil response or document from Document AI") } if resp.Document.Error != nil { logger.WithField("error", resp.Document.Error.Message).Error("Document processing error") - return "", fmt.Errorf("document processing error: %s", resp.Document.Error.Message) + return nil, fmt.Errorf("document processing error: %s", resp.Document.Error.Message) } - logger.WithField("content_length", len(resp.Document.Text)).Info("Successfully processed document") - return resp.Document.Text, nil + result := &OCRResult{ + Text: resp.Document.Text, + Metadata: map[string]string{ + "provider": "google_docai", + "mime_type": mtype.String(), + "lang_code": resp.Document.GetPages()[0].GetDetectedLanguages()[0].GetLanguageCode(), + "page_count": fmt.Sprintf("%d", len(resp.Document.GetPages())), + "processor_id": p.processorID, + }, + } + + // Add hOCR output if available + if len(resp.Document.GetPages()) > 0 { + hocr := generateHOCR(resp.Document) + if hocr != "" { + result.HOCR = hocr + } + } + + logger.WithField("content_length", len(result.Text)).Info("Successfully processed document") + return result, nil } // isImageMIMEType checks if the given MIME type is a supported image type @@ -109,6 +129,68 @@ func isImageMIMEType(mimeType string) bool { return supportedTypes[mimeType] } +// generateHOCR converts Document AI response to hOCR format +func generateHOCR(doc *documentaipb.Document) string { + if len(doc.GetPages()) == 0 { + return "" + } + + var hocr strings.Builder + hocr.WriteString(` + + + + OCR Output + + + +`) + + for pageNum, page := range doc.GetPages() { + pageWidth := page.GetDimension().GetWidth() + pageHeight := page.GetDimension().GetHeight() + + hocr.WriteString(fmt.Sprintf(` +
`, + pageNum+1, int(pageWidth), int(pageHeight))) + + // Process paragraphs + for _, para := range page.GetParagraphs() { + paraBox := para.GetLayout().GetBoundingPoly().GetNormalizedVertices() + if len(paraBox) < 4 { + continue + } + + // Convert normalized coordinates to absolute + x1 := int(paraBox[0].GetX() * pageWidth) + y1 := int(paraBox[0].GetY() * pageHeight) + x2 := int(paraBox[2].GetX() * pageWidth) + y2 := int(paraBox[2].GetY() * pageHeight) + + hocr.WriteString(fmt.Sprintf(` +

`, + pageNum+1, len(page.GetParagraphs()), x1, y1, x2, y2)) + + // Process words within paragraph + for _, token := range para.GetLayout().GetTextAnchor().GetTextSegments() { + text := doc.Text[token.GetStartIndex():token.GetEndIndex()] + if text == "" { + continue + } + + hocr.WriteString(fmt.Sprintf(` + %s`, text)) + } + + hocr.WriteString("\n

") + } + hocr.WriteString("\n
") + } + + hocr.WriteString("\n\n") + return hocr.String() +} + // Close releases resources used by the provider func (p *GoogleDocAIProvider) Close() error { if p.client != nil { diff --git a/ocr/google_docai_provider_test.go b/ocr/google_docai_provider_test.go new file mode 100644 index 0000000..0f16afd --- /dev/null +++ b/ocr/google_docai_provider_test.go @@ -0,0 +1,99 @@ +package ocr + +import ( + "context" + "regexp" + "strings" + "testing" + + "cloud.google.com/go/documentai/apiv1/documentaipb" +) + +func TestGenerateHOCR(t *testing.T) { + tests := []struct { + name string + doc *documentaipb.Document + expected string + }{ + { + name: "empty document", + doc: &documentaipb.Document{}, + expected: "", + }, + { + name: "single page with one paragraph", + doc: &documentaipb.Document{ + Text: "Hello World", + Pages: []*documentaipb.Document_Page{ + { + Dimension: &documentaipb.Document_Page_Dimension{ + Width: 800, + Height: 600, + }, + Paragraphs: []*documentaipb.Document_Page_Paragraph{ + { + Layout: &documentaipb.Document_Page_Layout{ + BoundingPoly: &documentaipb.BoundingPoly{ + NormalizedVertices: []*documentaipb.NormalizedVertex{ + {X: 0.1, Y: 0.1}, + {X: 0.9, Y: 0.1}, + {X: 0.9, Y: 0.2}, + {X: 0.1, Y: 0.2}, + }, + }, + TextAnchor: &documentaipb.Document_TextAnchor{ + TextSegments: []*documentaipb.Document_TextAnchor_TextSegment{ + { + StartIndex: 0, + EndIndex: 11, + }, + }, + }, + }, + }, + }, + }, + }, + }, + expected: "(?s).*
.*" + + "

.*" + + "Hello World.*

.*
.*", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := generateHOCR(tt.doc) + + if tt.expected == "" { + if result != "" { + t.Errorf("expected empty string, got %v", result) + } + return + } + + matched, err := regexp.MatchString(tt.expected, result) + if err != nil { + t.Fatalf("error matching regex: %v", err) + } + if !matched { + t.Errorf("expected to match regex %v\ngot: %v", tt.expected, result) + } + + // Verify basic hOCR structure + if !strings.Contains(result, "") { + t.Error("missing XML declaration") + } + if !strings.Contains(result, "