diff --git a/README.md b/README.md
index c829818..6b00991 100644
--- a/README.md
+++ b/README.md
@@ -75,8 +75,8 @@ services:
       OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
       LLM_LANGUAGE: 'English' # Optional, default is 'English'
       OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
-      VISION_LLM_PROVIDER: 'ollama' # Optional, for OCR
-      VISION_LLM_MODEL: 'minicpm-v' # Optional, for OCR
+      VISION_LLM_PROVIDER: 'ollama' # Optional (for OCR) - ollama or openai
+      VISION_LLM_MODEL: 'minicpm-v' # Optional (for OCR) - e.g. minicpm-v for ollama, gpt-4o for openai
       LOG_LEVEL: 'info' # Optional or 'debug', 'warn', 'error'
     volumes:
       - ./prompts:/app/prompts # Mount the prompts directory
diff --git a/app_llm.go b/app_llm.go
index b7228f6..a4a4c6b 100644
--- a/app_llm.go
+++ b/app_llm.go
@@ -3,6 +3,7 @@ package main
 import (
 	"bytes"
 	"context"
+	"encoding/base64"
 	"fmt"
 	"strings"
 	"sync"
@@ -81,14 +82,26 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
 	prompt := promptBuffer.String()
 
+	// If the provider is not OpenAI, send the image as a binary part; otherwise use an ImageURL part with base64 data-URI encoding per https://platform.openai.com/docs/guides/vision
+	var parts []llms.ContentPart
+	if strings.ToLower(visionLlmProvider) != "openai" {
+		parts = []llms.ContentPart{
+			llms.BinaryPart("image/jpeg", jpegBytes),
+			llms.TextPart(prompt),
+		}
+	} else {
+		base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
+		parts = []llms.ContentPart{
+			llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
+			llms.TextPart(prompt),
+		}
+	}
+
 	// Convert the image to text
 	completion, err := app.VisionLLM.GenerateContent(ctx, []llms.MessageContent{
 		{
-			Parts: []llms.ContentPart{
-				llms.BinaryPart("image/jpeg", jpegBytes),
-				llms.TextPart(prompt),
-			},
-			Role: llms.ChatMessageTypeHuman,
+			Parts: parts,
+			Role:  llms.ChatMessageTypeHuman,
 		},
 	})
 	if err != nil {
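For reference, the provider-dependent branch above can be read as a small standalone helper. This is a minimal sketch, assuming the `github.com/tmc/langchaingo/llms` package that the diff's `llms.*` calls come from; the `buildVisionParts` function name is hypothetical and not part of the change itself.

```go
package main

import (
	"encoding/base64"
	"fmt"
	"strings"

	"github.com/tmc/langchaingo/llms"
)

// buildVisionParts illustrates the branch in the diff: OpenAI's vision API
// expects the image as a base64 data URI in an image_url part, while other
// providers (e.g. Ollama) accept the raw JPEG bytes as a binary part.
func buildVisionParts(provider, prompt string, jpegBytes []byte) []llms.ContentPart {
	if strings.ToLower(provider) == "openai" {
		dataURI := fmt.Sprintf("data:image/jpeg;base64,%s",
			base64.StdEncoding.EncodeToString(jpegBytes))
		return []llms.ContentPart{
			llms.ImageURLPart(dataURI),
			llms.TextPart(prompt),
		}
	}
	return []llms.ContentPart{
		llms.BinaryPart("image/jpeg", jpegBytes),
		llms.TextPart(prompt),
	}
}
```

The resulting slice would then be passed as the `Parts` field of an `llms.MessageContent` with `Role: llms.ChatMessageTypeHuman`, exactly as `doOCRViaLLM` does after this change.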