From 5b3373743a9050ff5761eb89c648564457cea1f9 Mon Sep 17 00:00:00 2001
From: ccrlawrence
Date: Wed, 13 Nov 2024 04:47:25 +0000
Subject: [PATCH] Fix Vision OCR for OpenAI (#47)

## Summary by CodeRabbit

- **New Features**
  - Updated environment variable descriptions for improved clarity on OCR processing options.
  - Enhanced the `doOCRViaLLM` method to support different LLM providers and improve image data handling.

- **Bug Fixes**
  - Standardized error handling for better reporting across multiple methods.

- **Documentation**
  - Revised Docker Compose section in `README.md` to reflect updated environment variable options.
---
 README.md  |  4 ++--
 app_llm.go | 23 ++++++++++++++++++-----
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index c829818..6b00991 100644
--- a/README.md
+++ b/README.md
@@ -75,8 +75,8 @@ services:
       OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
       LLM_LANGUAGE: 'English' # Optional, default is 'English'
       OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
-      VISION_LLM_PROVIDER: 'ollama' # Optional, for OCR
-      VISION_LLM_MODEL: 'minicpm-v' # Optional, for OCR
+      VISION_LLM_PROVIDER: 'ollama' # Optional (for OCR) - ollama or openai
+      VISION_LLM_MODEL: 'minicpm-v' # Optional (for OCR) - minicpm-v, for example for ollama, gpt-4o for openai
       LOG_LEVEL: 'info' # Optional or 'debug', 'warn', 'error'
     volumes:
       - ./prompts:/app/prompts # Mount the prompts directory
diff --git a/app_llm.go b/app_llm.go
index b7228f6..a4a4c6b 100644
--- a/app_llm.go
+++ b/app_llm.go
@@ -3,6 +3,7 @@ package main
 import (
 	"bytes"
 	"context"
+	"encoding/base64"
 	"fmt"
 	"strings"
 	"sync"
@@ -81,14 +82,26 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
 
 	prompt := promptBuffer.String()
 
+	// If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision
+	var parts []llms.ContentPart
+	if strings.ToLower(visionLlmProvider) != "openai" {
+		parts = []llms.ContentPart{
+			llms.BinaryPart("image/jpeg", jpegBytes),
+			llms.TextPart(prompt),
+		}
+	} else {
+		base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
+		parts = []llms.ContentPart{
+			llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
+			llms.TextPart(prompt),
+		}
+	}
+
 	// Convert the image to text
 	completion, err := app.VisionLLM.GenerateContent(ctx, []llms.MessageContent{
 		{
-			Parts: []llms.ContentPart{
-				llms.BinaryPart("image/jpeg", jpegBytes),
-				llms.TextPart(prompt),
-			},
-			Role: llms.ChatMessageTypeHuman,
+			Parts: parts,
+			Role:  llms.ChatMessageTypeHuman,
 		},
 	})
 	if err != nil {