mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-12 21:08:00 -05:00
Fix Vision OCR for OpenAI (#47)
## Summary by CodeRabbit - **New Features** - Updated environment variable descriptions for improved clarity on OCR processing options. - Enhanced the `doOCRViaLLM` method to support different LLM providers and improve image data handling. - **Bug Fixes** - Standardized error handling for better reporting across multiple methods. - **Documentation** - Revised Docker Compose section in `README.md` to reflect updated environment variable options.
This commit is contained in:
parent
d1f23de5a6
commit
5b3373743a
2 changed files with 20 additions and 7 deletions
|
@@ -75,8 +75,8 @@ services:
|
|||
OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
|
||||
LLM_LANGUAGE: 'English' # Optional, default is 'English'
|
||||
OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
|
||||
VISION_LLM_PROVIDER: 'ollama' # Optional, for OCR
|
||||
VISION_LLM_MODEL: 'minicpm-v' # Optional, for OCR
|
||||
VISION_LLM_PROVIDER: 'ollama' # Optional (for OCR) - ollama or openai
|
||||
VISION_LLM_MODEL: 'minicpm-v' # Optional (for OCR) - e.g. 'minicpm-v' for ollama, 'gpt-4o' for openai
|
||||
LOG_LEVEL: 'info' # Optional: one of 'info', 'debug', 'warn', 'error'
|
||||
volumes:
|
||||
- ./prompts:/app/prompts # Mount the prompts directory
|
||||
|
|
21
app_llm.go
21
app_llm.go
|
@@ -3,6 +3,7 @@ package main
|
|||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
@@ -81,13 +82,25 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
|
|||
|
||||
prompt := promptBuffer.String()
|
||||
|
||||
// If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision
|
||||
var parts []llms.ContentPart
|
||||
if strings.ToLower(visionLlmProvider) != "openai" {
|
||||
parts = []llms.ContentPart{
|
||||
llms.BinaryPart("image/jpeg", jpegBytes),
|
||||
llms.TextPart(prompt),
|
||||
}
|
||||
} else {
|
||||
base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
|
||||
parts = []llms.ContentPart{
|
||||
llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
|
||||
llms.TextPart(prompt),
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the image to text
|
||||
completion, err := app.VisionLLM.GenerateContent(ctx, []llms.MessageContent{
|
||||
{
|
||||
Parts: []llms.ContentPart{
|
||||
llms.BinaryPart("image/jpeg", jpegBytes),
|
||||
llms.TextPart(prompt),
|
||||
},
|
||||
Parts: parts,
|
||||
Role: llms.ChatMessageTypeHuman,
|
||||
},
|
||||
})
|
||||
|
|
Loading…
Reference in a new issue