mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-12 21:08:00 -05:00
Fix Vision OCR for OpenAI (#47)
## Summary by CodeRabbit - **New Features** - Updated environment variable descriptions for improved clarity on OCR processing options. - Enhanced the `doOCRViaLLM` method to support different LLM providers and improve image data handling. - **Bug Fixes** - Standardized error handling for better reporting across multiple methods. - **Documentation** - Revised Docker Compose section in `README.md` to reflect updated environment variable options.
This commit is contained in:
parent
d1f23de5a6
commit
5b3373743a
2 changed files with 20 additions and 7 deletions
|
@@ -75,8 +75,8 @@ services:
|
|||
OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
|
||||
LLM_LANGUAGE: 'English' # Optional, default is 'English'
|
||||
OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
|
||||
VISION_LLM_PROVIDER: 'ollama' # Optional, for OCR
|
||||
VISION_LLM_MODEL: 'minicpm-v' # Optional, for OCR
|
||||
VISION_LLM_PROVIDER: 'ollama' # Optional (for OCR) - ollama or openai
|
||||
VISION_LLM_MODEL: 'minicpm-v' # Optional (for OCR) - e.g. 'minicpm-v' for ollama, 'gpt-4o' for openai
|
||||
LOG_LEVEL: 'info' # Optional: one of 'info', 'debug', 'warn', 'error'
|
||||
volumes:
|
||||
- ./prompts:/app/prompts # Mount the prompts directory
|
||||
|
|
21
app_llm.go
21
app_llm.go
|
@@ -3,6 +3,7 @@ package main
|
|||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
@@ -81,13 +82,25 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
|
|||
|
||||
prompt := promptBuffer.String()
|
||||
|
||||
// If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision
|
||||
var parts []llms.ContentPart
|
||||
if strings.ToLower(visionLlmProvider) != "openai" {
|
||||
parts = []llms.ContentPart{
|
||||
llms.BinaryPart("image/jpeg", jpegBytes),
|
||||
llms.TextPart(prompt),
|
||||
}
|
||||
} else {
|
||||
base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
|
||||
parts = []llms.ContentPart{
|
||||
llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
|
||||
llms.TextPart(prompt),
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the image to text
|
||||
completion, err := app.VisionLLM.GenerateContent(ctx, []llms.MessageContent{
|
||||
{
|
||||
Parts: []llms.ContentPart{
|
||||
llms.BinaryPart("image/jpeg", jpegBytes),
|
||||
llms.TextPart(prompt),
|
||||
},
|
||||
Parts: parts,
|
||||
Role: llms.ChatMessageTypeHuman,
|
||||
},
|
||||
})
|
||||
|
|
Loading…
Reference in a new issue