diff --git a/cline_docs/systemPatterns.md b/cline_docs/systemPatterns.md index c3225f8..54f4573 100644 --- a/cline_docs/systemPatterns.md +++ b/cline_docs/systemPatterns.md @@ -92,3 +92,88 @@ - E2E tests for web interface - Test fixtures and mocks - Playwright for frontend testing + +## OCR System Patterns + +### OCR Provider Architecture + +#### 1. Provider Interface +- Common interface for all OCR implementations +- Methods for image processing +- Configuration through standardized Config struct +- Resource management patterns + +#### 2. LLM Provider Implementation +- Supports OpenAI and Ollama vision models +- Base64 encoding for OpenAI requests +- Binary format for Ollama requests +- Template-based OCR prompts + +#### 3. Google Document AI Provider +- Enterprise-grade OCR processing +- MIME type validation +- Processor configuration via environment +- Regional endpoint support + +### Logging Patterns + +#### 1. Provider Initialization +``` +[INFO] Initializing OCR provider: llm +[INFO] Using LLM OCR provider (provider=ollama, model=minicpm-v) +``` + +#### 2. Processing Logs +``` +[DEBUG] Starting OCR processing +[DEBUG] Image dimensions (width=800, height=1200) +[DEBUG] Using binary image format for non-OpenAI provider +[DEBUG] Sending request to vision model +[INFO] Successfully processed image (content_length=1536) +``` + +#### 3. Error Logging +``` +[ERROR] Failed to decode image: invalid format +[ERROR] Unsupported file type: image/webp +[ERROR] Failed to get response from vision model +``` + +### Error Handling Patterns + +#### 1. Configuration Validation +- Required parameter checks +- Environment variable validation +- Provider-specific configuration +- Connection testing + +#### 2. Processing Errors +- Image format validation +- MIME type checking +- Content processing errors +- Provider-specific error handling + +#### 3. Error Propagation +- Detailed error contexts +- Original error wrapping +- Logging with error context +- Recovery mechanisms + +### Processing Flow + +#### 1. Document Processing +``` +Document Tagged → OCR Provider Selected → Image Processing → Text Extraction → Content Update +``` + +#### 2. Provider Selection +``` +Config Check → Provider Initialization → Resource Setup → Provider Ready +``` + +#### 3. Error Recovery +``` +Error Detection → Logging → Cleanup → Error Propagation +``` + +These patterns ensure consistent behavior across OCR providers while maintaining proper logging and error handling throughout the system. diff --git a/ocr.go b/ocr.go index 9d843ec..c6e82b1 100644 --- a/ocr.go +++ b/ocr.go @@ -36,13 +36,20 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err) } - ocrText, err := app.ocrProvider.ProcessImage(ctx, imageContent) + result, err := app.ocrProvider.ProcessImage(ctx, imageContent) if err != nil { return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err) } - pageLogger.Debug("OCR completed for page") + if result == nil { + pageLogger.Error("Got nil result from OCR provider") + return "", fmt.Errorf("error performing OCR for document %d, page %d: nil result", documentID, i+1) + } - ocrTexts = append(ocrTexts, ocrText) + pageLogger.WithField("has_hocr", result.HOCR != ""). + WithField("metadata", result.Metadata). + Debug("OCR completed for page") + + ocrTexts = append(ocrTexts, result.Text) } docLogger.Info("OCR processing completed successfully") diff --git a/ocr/google_docai_provider.go b/ocr/google_docai_provider.go index 265797f..1ed8ae9 100644 --- a/ocr/google_docai_provider.go +++ b/ocr/google_docai_provider.go @@ -3,6 +3,7 @@ package ocr import ( "context" "fmt" + "strings" documentai "cloud.google.com/go/documentai/apiv1" "cloud.google.com/go/documentai/apiv1/documentaipb" @@ -46,7 +47,7 @@ func newGoogleDocAIProvider(config Config) (*GoogleDocAIProvider, error) { return provider, nil } -func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) { +func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) { logger := log.WithFields(logrus.Fields{ "project_id": p.projectID, "location": p.location, @@ -60,7 +61,7 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b if !isImageMIMEType(mtype.String()) { logger.WithField("mime_type", mtype.String()).Error("Unsupported file type") - return "", fmt.Errorf("unsupported file type: %s", mtype.String()) + return nil, fmt.Errorf("unsupported file type: %s", mtype.String()) } name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID) @@ -79,21 +80,40 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b resp, err := p.client.ProcessDocument(ctx, req) if err != nil { logger.WithError(err).Error("Failed to process document") - return "", fmt.Errorf("error processing document: %w", err) + return nil, fmt.Errorf("error processing document: %w", err) } if resp == nil || resp.Document == nil { logger.Error("Received nil response or document from Document AI") - return "", fmt.Errorf("received nil response or document from Document AI") + return nil, fmt.Errorf("received nil response or document from Document AI") } if resp.Document.Error != nil { logger.WithField("error", resp.Document.Error.Message).Error("Document processing error") - return "", fmt.Errorf("document processing error: %s", resp.Document.Error.Message) + return nil, fmt.Errorf("document processing error: %s", resp.Document.Error.Message) } - logger.WithField("content_length", len(resp.Document.Text)).Info("Successfully processed document") - return resp.Document.Text, nil + result := &OCRResult{ + Text: resp.Document.Text, + Metadata: map[string]string{ + "provider": "google_docai", + "mime_type": mtype.String(), + "lang_code": resp.Document.GetPages()[0].GetDetectedLanguages()[0].GetLanguageCode(), + "page_count": fmt.Sprintf("%d", len(resp.Document.GetPages())), + "processor_id": p.processorID, + }, + } + + // Add hOCR output if available + if len(resp.Document.GetPages()) > 0 { + hocr := generateHOCR(resp.Document) + if hocr != "" { + result.HOCR = hocr + } + } + + logger.WithField("content_length", len(result.Text)).Info("Successfully processed document") + return result, nil } // isImageMIMEType checks if the given MIME type is a supported image type @@ -109,6 +129,68 @@ func isImageMIMEType(mimeType string) bool { return supportedTypes[mimeType] } +// generateHOCR converts Document AI response to hOCR format +func generateHOCR(doc *documentaipb.Document) string { + if len(doc.GetPages()) == 0 { + return "" + } + + var hocr strings.Builder + hocr.WriteString(` + + +
+`, + pageNum+1, len(page.GetParagraphs()), x1, y1, x2, y2)) + + // Process words within paragraph + for _, token := range para.GetLayout().GetTextAnchor().GetTextSegments() { + text := doc.Text[token.GetStartIndex():token.GetEndIndex()] + if text == "" { + continue + } + + hocr.WriteString(fmt.Sprintf(` + %s`, text)) + } + + hocr.WriteString("\n
") + } + hocr.WriteString("\n.*" + + "Hello World.*
.*