From 5b3373743a9050ff5761eb89c648564457cea1f9 Mon Sep 17 00:00:00 2001
From: ccrlawrence
Date: Wed, 13 Nov 2024 04:47:25 +0000
Subject: [PATCH] Fix Vision OCR for OpenAI (#47)

## Summary by CodeRabbit

- **New Features**
  - Updated environment variable descriptions for improved clarity on OCR processing options.
  - Enhanced the `doOCRViaLLM` method to support different LLM providers and improve image data handling.

- **Bug Fixes**
  - Standardized error handling for better reporting across multiple methods.

- **Documentation**
  - Revised Docker Compose section in `README.md` to reflect updated environment variable options.
---
 README.md  |  4 ++--
 app_llm.go | 23 ++++++++++++++++++-----
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index c829818..6b00991 100644
--- a/README.md
+++ b/README.md
@@ -75,8 +75,8 @@ services:
       OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
       LLM_LANGUAGE: 'English' # Optional, default is 'English'
       OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
-      VISION_LLM_PROVIDER: 'ollama' # Optional, for OCR
-      VISION_LLM_MODEL: 'minicpm-v' # Optional, for OCR
+      VISION_LLM_PROVIDER: 'ollama' # Optional (for OCR) - ollama or openai
+      VISION_LLM_MODEL: 'minicpm-v' # Optional (for OCR) - minicpm-v, for example for ollama, gpt-4o for openai
       LOG_LEVEL: 'info' # Optional or 'debug', 'warn', 'error'
     volumes:
       - ./prompts:/app/prompts # Mount the prompts directory
diff --git a/app_llm.go b/app_llm.go
index b7228f6..a4a4c6b 100644
--- a/app_llm.go
+++ b/app_llm.go
@@ -3,6 +3,7 @@ package main
 import (
 	"bytes"
 	"context"
+	"encoding/base64"
 	"fmt"
 	"strings"
 	"sync"
@@ -81,14 +82,26 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
 
 	prompt := promptBuffer.String()
 
+	// If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision
+	var parts []llms.ContentPart
+	if strings.ToLower(visionLlmProvider) != "openai" {
+		parts = []llms.ContentPart{
+			llms.BinaryPart("image/jpeg", jpegBytes),
+			llms.TextPart(prompt),
+		}
+	} else {
+		base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
+		parts = []llms.ContentPart{
+			llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
+			llms.TextPart(prompt),
+		}
+	}
+
 	// Convert the image to text
 	completion, err := app.VisionLLM.GenerateContent(ctx, []llms.MessageContent{
 		{
-			Parts: []llms.ContentPart{
-				llms.BinaryPart("image/jpeg", jpegBytes),
-				llms.TextPart(prompt),
-			},
-			Role: llms.ChatMessageTypeHuman,
+			Parts: parts,
+			Role:  llms.ChatMessageTypeHuman,
 		},
 	})
 	if err != nil {