Fix Vision OCR for OpenAI (#47)

## Summary by CodeRabbit

- **New Features**
	- Updated environment variable descriptions for improved clarity on OCR processing options.
	- Enhanced the `doOCRViaLLM` method to support different LLM providers and improve image data handling.

- **Bug Fixes**
	- Standardized error handling for better reporting across multiple methods. 

- **Documentation**
	- Revised Docker Compose section in `README.md` to reflect updated environment variable options.
Commit 5b3373743a by ccrlawrence, 2024-11-13 04:47:25 +00:00 (committed by GitHub); parent d1f23de5a6.
2 changed files with 20 additions and 7 deletions

README.md:

@@ -75,8 +75,8 @@ services:
       OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
       LLM_LANGUAGE: 'English' # Optional, default is 'English'
       OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
-      VISION_LLM_PROVIDER: 'ollama' # Optional, for OCR
-      VISION_LLM_MODEL: 'minicpm-v' # Optional, for OCR
+      VISION_LLM_PROVIDER: 'ollama' # Optional (for OCR) - ollama or openai
+      VISION_LLM_MODEL: 'minicpm-v' # Optional (for OCR) - minicpm-v, for example for ollama, gpt-4o for openai
       LOG_LEVEL: 'info' # Optional or 'debug', 'warn', 'error'
     volumes:
       - ./prompts:/app/prompts # Mount the prompts directory
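
For example, a compose configuration that switches OCR to OpenAI sets the two variables together. This is a minimal illustrative snippet, not taken verbatim from the repository (the service name is a placeholder):

```yaml
services:
  app: # illustrative service name
    environment:
      OPENAI_API_KEY: 'your_openai_api_key' # Required when the provider is 'openai'
      VISION_LLM_PROVIDER: 'openai'
      VISION_LLM_MODEL: 'gpt-4o' # An OpenAI vision-capable model
```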

Go source (`package main`):

@ -3,6 +3,7 @@ package main
import ( import (
"bytes" "bytes"
"context" "context"
"encoding/base64"
"fmt" "fmt"
"strings" "strings"
"sync" "sync"
@@ -81,13 +82,25 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, error) {
 	prompt := promptBuffer.String()
 
+	// If the provider is not OpenAI, send the image as a binary part; otherwise use an
+	// ImageURL part with base64 encoding, per https://platform.openai.com/docs/guides/vision
+	var parts []llms.ContentPart
+	if strings.ToLower(visionLlmProvider) != "openai" {
+		parts = []llms.ContentPart{
+			llms.BinaryPart("image/jpeg", jpegBytes),
+			llms.TextPart(prompt),
+		}
+	} else {
+		base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
+		parts = []llms.ContentPart{
+			llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
+			llms.TextPart(prompt),
+		}
+	}
+
 	// Convert the image to text
 	completion, err := app.VisionLLM.GenerateContent(ctx, []llms.MessageContent{
 		{
-			Parts: []llms.ContentPart{
-				llms.BinaryPart("image/jpeg", jpegBytes),
-				llms.TextPart(prompt),
-			},
+			Parts: parts,
 			Role: llms.ChatMessageTypeHuman,
 		},
 	})
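
Read as a whole, the patch amounts to a small provider switch when building the message parts. The sketch below restates it as a standalone helper, assuming langchaingo's `llms` package; `buildOCRParts` and its signature are made up for illustration, and `provider` stands in for however the app resolves the `VISION_LLM_PROVIDER` setting:

```go
package main

import (
	"encoding/base64"
	"fmt"
	"strings"

	"github.com/tmc/langchaingo/llms"
)

// buildOCRParts assembles the content parts for a vision OCR request.
// Non-OpenAI providers (e.g. ollama) accept the raw JPEG as a binary part,
// while OpenAI expects the image as a base64-encoded data URL in an
// image_url part.
func buildOCRParts(provider, prompt string, jpegBytes []byte) []llms.ContentPart {
	if strings.ToLower(provider) != "openai" {
		return []llms.ContentPart{
			llms.BinaryPart("image/jpeg", jpegBytes),
			llms.TextPart(prompt),
		}
	}
	base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
	return []llms.ContentPart{
		llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
		llms.TextPart(prompt),
	}
}
```

The call site then wraps the returned parts in a single `llms.MessageContent` with `Role: llms.ChatMessageTypeHuman` and passes it to `GenerateContent`, exactly as the diff above shows.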