mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-13 05:08:01 -05:00
feat(ocr): enhance OCR processing with structured results and hOCR support
This commit is contained in:
parent
18ba388af1
commit
d71d340eb5
6 changed files with 311 additions and 16 deletions
|
@ -92,3 +92,88 @@
|
|||
- E2E tests for web interface
|
||||
- Test fixtures and mocks
|
||||
- Playwright for frontend testing
|
||||
|
||||
## OCR System Patterns
|
||||
|
||||
### OCR Provider Architecture
|
||||
|
||||
#### 1. Provider Interface
|
||||
- Common interface for all OCR implementations
|
||||
- Methods for image processing
|
||||
- Configuration through standardized Config struct
|
||||
- Resource management patterns
|
||||
|
||||
#### 2. LLM Provider Implementation
|
||||
- Supports OpenAI and Ollama vision models
|
||||
- Base64 encoding for OpenAI requests
|
||||
- Binary format for Ollama requests
|
||||
- Template-based OCR prompts
|
||||
|
||||
#### 3. Google Document AI Provider
|
||||
- Enterprise-grade OCR processing
|
||||
- MIME type validation
|
||||
- Processor configuration via environment
|
||||
- Regional endpoint support
|
||||
|
||||
### Logging Patterns
|
||||
|
||||
#### 1. Provider Initialization
|
||||
```
|
||||
[INFO] Initializing OCR provider: llm
|
||||
[INFO] Using LLM OCR provider (provider=ollama, model=minicpm-v)
|
||||
```
|
||||
|
||||
#### 2. Processing Logs
|
||||
```
|
||||
[DEBUG] Starting OCR processing
|
||||
[DEBUG] Image dimensions (width=800, height=1200)
|
||||
[DEBUG] Using binary image format for non-OpenAI provider
|
||||
[DEBUG] Sending request to vision model
|
||||
[INFO] Successfully processed image (content_length=1536)
|
||||
```
|
||||
|
||||
#### 3. Error Logging
|
||||
```
|
||||
[ERROR] Failed to decode image: invalid format
|
||||
[ERROR] Unsupported file type: image/webp
|
||||
[ERROR] Failed to get response from vision model
|
||||
```
|
||||
|
||||
### Error Handling Patterns
|
||||
|
||||
#### 1. Configuration Validation
|
||||
- Required parameter checks
|
||||
- Environment variable validation
|
||||
- Provider-specific configuration
|
||||
- Connection testing
|
||||
|
||||
#### 2. Processing Errors
|
||||
- Image format validation
|
||||
- MIME type checking
|
||||
- Content processing errors
|
||||
- Provider-specific error handling
|
||||
|
||||
#### 3. Error Propagation
|
||||
- Detailed error contexts
|
||||
- Original error wrapping
|
||||
- Logging with error context
|
||||
- Recovery mechanisms
|
||||
|
||||
### Processing Flow
|
||||
|
||||
#### 1. Document Processing
|
||||
```
|
||||
Document Tagged → OCR Provider Selected → Image Processing → Text Extraction → Content Update
|
||||
```
|
||||
|
||||
#### 2. Provider Selection
|
||||
```
|
||||
Config Check → Provider Initialization → Resource Setup → Provider Ready
|
||||
```
|
||||
|
||||
#### 3. Error Recovery
|
||||
```
|
||||
Error Detection → Logging → Cleanup → Error Propagation
|
||||
```
|
||||
|
||||
These patterns ensure consistent behavior across OCR providers while maintaining proper logging and error handling throughout the system.
|
||||
|
|
13
ocr.go
13
ocr.go
|
@ -36,13 +36,20 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string,
|
|||
return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err)
|
||||
}
|
||||
|
||||
ocrText, err := app.ocrProvider.ProcessImage(ctx, imageContent)
|
||||
result, err := app.ocrProvider.ProcessImage(ctx, imageContent)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err)
|
||||
}
|
||||
pageLogger.Debug("OCR completed for page")
|
||||
if result == nil {
|
||||
pageLogger.Error("Got nil result from OCR provider")
|
||||
return "", fmt.Errorf("error performing OCR for document %d, page %d: nil result", documentID, i+1)
|
||||
}
|
||||
|
||||
ocrTexts = append(ocrTexts, ocrText)
|
||||
pageLogger.WithField("has_hocr", result.HOCR != "").
|
||||
WithField("metadata", result.Metadata).
|
||||
Debug("OCR completed for page")
|
||||
|
||||
ocrTexts = append(ocrTexts, result.Text)
|
||||
}
|
||||
|
||||
docLogger.Info("OCR processing completed successfully")
|
||||
|
|
|
@ -3,6 +3,7 @@ package ocr
|
|||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
documentai "cloud.google.com/go/documentai/apiv1"
|
||||
"cloud.google.com/go/documentai/apiv1/documentaipb"
|
||||
|
@ -46,7 +47,7 @@ func newGoogleDocAIProvider(config Config) (*GoogleDocAIProvider, error) {
|
|||
return provider, nil
|
||||
}
|
||||
|
||||
func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
|
||||
func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
|
||||
logger := log.WithFields(logrus.Fields{
|
||||
"project_id": p.projectID,
|
||||
"location": p.location,
|
||||
|
@ -60,7 +61,7 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
|
|||
|
||||
if !isImageMIMEType(mtype.String()) {
|
||||
logger.WithField("mime_type", mtype.String()).Error("Unsupported file type")
|
||||
return "", fmt.Errorf("unsupported file type: %s", mtype.String())
|
||||
return nil, fmt.Errorf("unsupported file type: %s", mtype.String())
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID)
|
||||
|
@ -79,21 +80,40 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []b
|
|||
resp, err := p.client.ProcessDocument(ctx, req)
|
||||
if err != nil {
|
||||
logger.WithError(err).Error("Failed to process document")
|
||||
return "", fmt.Errorf("error processing document: %w", err)
|
||||
return nil, fmt.Errorf("error processing document: %w", err)
|
||||
}
|
||||
|
||||
if resp == nil || resp.Document == nil {
|
||||
logger.Error("Received nil response or document from Document AI")
|
||||
return "", fmt.Errorf("received nil response or document from Document AI")
|
||||
return nil, fmt.Errorf("received nil response or document from Document AI")
|
||||
}
|
||||
|
||||
if resp.Document.Error != nil {
|
||||
logger.WithField("error", resp.Document.Error.Message).Error("Document processing error")
|
||||
return "", fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
|
||||
return nil, fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
|
||||
}
|
||||
|
||||
logger.WithField("content_length", len(resp.Document.Text)).Info("Successfully processed document")
|
||||
return resp.Document.Text, nil
|
||||
result := &OCRResult{
|
||||
Text: resp.Document.Text,
|
||||
Metadata: map[string]string{
|
||||
"provider": "google_docai",
|
||||
"mime_type": mtype.String(),
|
||||
"lang_code": resp.Document.GetPages()[0].GetDetectedLanguages()[0].GetLanguageCode(),
|
||||
"page_count": fmt.Sprintf("%d", len(resp.Document.GetPages())),
|
||||
"processor_id": p.processorID,
|
||||
},
|
||||
}
|
||||
|
||||
// Add hOCR output if available
|
||||
if len(resp.Document.GetPages()) > 0 {
|
||||
hocr := generateHOCR(resp.Document)
|
||||
if hocr != "" {
|
||||
result.HOCR = hocr
|
||||
}
|
||||
}
|
||||
|
||||
logger.WithField("content_length", len(result.Text)).Info("Successfully processed document")
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// isImageMIMEType checks if the given MIME type is a supported image type
|
||||
|
@ -109,6 +129,68 @@ func isImageMIMEType(mimeType string) bool {
|
|||
return supportedTypes[mimeType]
|
||||
}
|
||||
|
||||
// generateHOCR converts Document AI response to hOCR format
|
||||
func generateHOCR(doc *documentaipb.Document) string {
|
||||
if len(doc.GetPages()) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
var hocr strings.Builder
|
||||
hocr.WriteString(`<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||
<head>
|
||||
<title>OCR Output</title>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
||||
<meta name='ocr-system' content='google-docai' />
|
||||
</head>
|
||||
<body>`)
|
||||
|
||||
for pageNum, page := range doc.GetPages() {
|
||||
pageWidth := page.GetDimension().GetWidth()
|
||||
pageHeight := page.GetDimension().GetHeight()
|
||||
|
||||
hocr.WriteString(fmt.Sprintf(`
|
||||
<div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`,
|
||||
pageNum+1, int(pageWidth), int(pageHeight)))
|
||||
|
||||
// Process paragraphs
|
||||
for _, para := range page.GetParagraphs() {
|
||||
paraBox := para.GetLayout().GetBoundingPoly().GetNormalizedVertices()
|
||||
if len(paraBox) < 4 {
|
||||
continue
|
||||
}
|
||||
|
||||
// Convert normalized coordinates to absolute
|
||||
x1 := int(paraBox[0].GetX() * pageWidth)
|
||||
y1 := int(paraBox[0].GetY() * pageHeight)
|
||||
x2 := int(paraBox[2].GetX() * pageWidth)
|
||||
y2 := int(paraBox[2].GetY() * pageHeight)
|
||||
|
||||
hocr.WriteString(fmt.Sprintf(`
|
||||
<p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`,
|
||||
pageNum+1, len(page.GetParagraphs()), x1, y1, x2, y2))
|
||||
|
||||
// Process words within paragraph
|
||||
for _, token := range para.GetLayout().GetTextAnchor().GetTextSegments() {
|
||||
text := doc.Text[token.GetStartIndex():token.GetEndIndex()]
|
||||
if text == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
hocr.WriteString(fmt.Sprintf(`
|
||||
<span class='ocrx_word'>%s</span>`, text))
|
||||
}
|
||||
|
||||
hocr.WriteString("\n </p>")
|
||||
}
|
||||
hocr.WriteString("\n </div>")
|
||||
}
|
||||
|
||||
hocr.WriteString("\n</body>\n</html>")
|
||||
return hocr.String()
|
||||
}
|
||||
|
||||
// Close releases resources used by the provider
|
||||
func (p *GoogleDocAIProvider) Close() error {
|
||||
if p.client != nil {
|
||||
|
|
99
ocr/google_docai_provider_test.go
Normal file
99
ocr/google_docai_provider_test.go
Normal file
|
@ -0,0 +1,99 @@
|
|||
package ocr
|
||||
|
||||
import (
|
||||
"context"
|
||||
"regexp"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"cloud.google.com/go/documentai/apiv1/documentaipb"
|
||||
)
|
||||
|
||||
func TestGenerateHOCR(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
doc *documentaipb.Document
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "empty document",
|
||||
doc: &documentaipb.Document{},
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "single page with one paragraph",
|
||||
doc: &documentaipb.Document{
|
||||
Text: "Hello World",
|
||||
Pages: []*documentaipb.Document_Page{
|
||||
{
|
||||
Dimension: &documentaipb.Document_Page_Dimension{
|
||||
Width: 800,
|
||||
Height: 600,
|
||||
},
|
||||
Paragraphs: []*documentaipb.Document_Page_Paragraph{
|
||||
{
|
||||
Layout: &documentaipb.Document_Page_Layout{
|
||||
BoundingPoly: &documentaipb.BoundingPoly{
|
||||
NormalizedVertices: []*documentaipb.NormalizedVertex{
|
||||
{X: 0.1, Y: 0.1},
|
||||
{X: 0.9, Y: 0.1},
|
||||
{X: 0.9, Y: 0.2},
|
||||
{X: 0.1, Y: 0.2},
|
||||
},
|
||||
},
|
||||
TextAnchor: &documentaipb.Document_TextAnchor{
|
||||
TextSegments: []*documentaipb.Document_TextAnchor_TextSegment{
|
||||
{
|
||||
StartIndex: 0,
|
||||
EndIndex: 11,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expected: "(?s).*<div class='ocr_page' id='page_1' title='image;bbox 0 0 800 600'>.*" +
|
||||
"<p class='ocr_par' id='par_1_1' title='bbox 80 60 720 120'>.*" +
|
||||
"<span class='ocrx_word'>Hello World</span>.*</p>.*</div>.*",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := generateHOCR(tt.doc)
|
||||
|
||||
if tt.expected == "" {
|
||||
if result != "" {
|
||||
t.Errorf("expected empty string, got %v", result)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
matched, err := regexp.MatchString(tt.expected, result)
|
||||
if err != nil {
|
||||
t.Fatalf("error matching regex: %v", err)
|
||||
}
|
||||
if !matched {
|
||||
t.Errorf("expected to match regex %v\ngot: %v", tt.expected, result)
|
||||
}
|
||||
|
||||
// Verify basic hOCR structure
|
||||
if !strings.Contains(result, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>") {
|
||||
t.Error("missing XML declaration")
|
||||
}
|
||||
if !strings.Contains(result, "<html xmlns=\"http://www.w3.org/1999/xhtml\"") {
|
||||
t.Error("missing HTML namespace")
|
||||
}
|
||||
if !strings.Contains(result, "<meta name='ocr-system' content='google-docai'") {
|
||||
t.Error("missing OCR system metadata")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// testContext returns context.Background().
func testContext() context.Context {
	ctx := context.Background()
	return ctx
}
|
|
@ -60,7 +60,7 @@ func newLLMProvider(config Config) (*LLMProvider, error) {
|
|||
}, nil
|
||||
}
|
||||
|
||||
func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
|
||||
func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error) {
|
||||
logger := log.WithFields(logrus.Fields{
|
||||
"provider": p.provider,
|
||||
"model": p.model,
|
||||
|
@ -71,7 +71,7 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
|
|||
img, _, err := image.Decode(bytes.NewReader(imageContent))
|
||||
if err != nil {
|
||||
logger.WithError(err).Error("Failed to decode image")
|
||||
return "", fmt.Errorf("error decoding image: %w", err)
|
||||
return nil, fmt.Errorf("error decoding image: %w", err)
|
||||
}
|
||||
bounds := img.Bounds()
|
||||
logger.WithFields(logrus.Fields{
|
||||
|
@ -106,11 +106,18 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (st
|
|||
})
|
||||
if err != nil {
|
||||
logger.WithError(err).Error("Failed to get response from vision model")
|
||||
return "", fmt.Errorf("error getting response from LLM: %w", err)
|
||||
return nil, fmt.Errorf("error getting response from LLM: %w", err)
|
||||
}
|
||||
|
||||
logger.WithField("content_length", len(completion.Choices[0].Content)).Info("Successfully processed image")
|
||||
return completion.Choices[0].Content, nil
|
||||
result := &OCRResult{
|
||||
Text: completion.Choices[0].Content,
|
||||
Metadata: map[string]string{
|
||||
"provider": p.provider,
|
||||
"model": p.model,
|
||||
},
|
||||
}
|
||||
logger.WithField("content_length", len(result.Text)).Info("Successfully processed image")
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// createOpenAIClient creates a new OpenAI vision model client
|
||||
|
|
|
@ -9,9 +9,21 @@ import (
|
|||
|
||||
var log = logrus.New()
|
||||
|
||||
// OCRResult is the structured output of a single OCR run.
type OCRResult struct {
	// Text is the plain-text output (required).
	Text string

	// HOCR is the hOCR output (optional; set only if the provider supports it).
	HOCR string

	// Metadata carries additional provider-specific key/value pairs.
	Metadata map[string]string
}
|
||||
|
||||
// Provider defines the interface for OCR processing
|
||||
type Provider interface {
|
||||
ProcessImage(ctx context.Context, imageContent []byte) (string, error)
|
||||
ProcessImage(ctx context.Context, imageContent []byte) (*OCRResult, error)
|
||||
}
|
||||
|
||||
// Config holds the OCR provider configuration
|
||||
|
@ -27,6 +39,9 @@ type Config struct {
|
|||
// LLM settings (from existing config)
|
||||
VisionLLMProvider string
|
||||
VisionLLMModel string
|
||||
|
||||
// OCR output options
|
||||
EnableHOCR bool // Whether to request hOCR output if supported by the provider
|
||||
}
|
||||
|
||||
// NewProvider creates a new OCR provider based on configuration
|
||||
|
|
Loading…
Reference in a new issue