mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-12 21:08:00 -05:00
Fix Vision OCR for OpenAI (#47)
## Summary by CodeRabbit

- **New Features**
  - Updated environment variable descriptions for improved clarity on OCR processing options.
  - Enhanced the `doOCRViaLLM` method to support different LLM providers and improve image data handling.
- **Bug Fixes**
  - Standardized error handling for better reporting across multiple methods.
- **Documentation**
  - Revised Docker Compose section in `README.md` to reflect updated environment variable options.
This commit is contained in:
parent
d1f23de5a6
commit
5b3373743a
2 changed files with 20 additions and 7 deletions
|
@ -75,8 +75,8 @@ services:
|
||||||
OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
|
OPENAI_API_KEY: 'your_openai_api_key' # Required if using OpenAI
|
||||||
LLM_LANGUAGE: 'English' # Optional, default is 'English'
|
LLM_LANGUAGE: 'English' # Optional, default is 'English'
|
||||||
OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
|
OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
|
||||||
VISION_LLM_PROVIDER: 'ollama' # Optional, for OCR
|
VISION_LLM_PROVIDER: 'ollama' # Optional (for OCR) - ollama or openai
|
||||||
VISION_LLM_MODEL: 'minicpm-v' # Optional, for OCR
|
VISION_LLM_MODEL: 'minicpm-v' # Optional (for OCR) - e.g. minicpm-v for ollama, gpt-4o for openai
|
||||||
LOG_LEVEL: 'info' # Optional or 'debug', 'warn', 'error'
|
LOG_LEVEL: 'info' # Optional or 'debug', 'warn', 'error'
|
||||||
volumes:
|
volumes:
|
||||||
- ./prompts:/app/prompts # Mount the prompts directory
|
- ./prompts:/app/prompts # Mount the prompts directory
|
||||||
|
|
23
app_llm.go
23
app_llm.go
|
@ -3,6 +3,7 @@ package main
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
|
"encoding/base64"
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
@ -81,14 +82,26 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
|
||||||
|
|
||||||
prompt := promptBuffer.String()
|
prompt := promptBuffer.String()
|
||||||
|
|
||||||
|
// For providers other than OpenAI, send the image as a binary part. For OpenAI, send it as an ImageURL part containing a base64-encoded data URL, as described in https://platform.openai.com/docs/guides/vision
|
||||||
|
var parts []llms.ContentPart
|
||||||
|
if strings.ToLower(visionLlmProvider) != "openai" {
|
||||||
|
parts = []llms.ContentPart{
|
||||||
|
llms.BinaryPart("image/jpeg", jpegBytes),
|
||||||
|
llms.TextPart(prompt),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
|
||||||
|
parts = []llms.ContentPart{
|
||||||
|
llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
|
||||||
|
llms.TextPart(prompt),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Convert the image to text
|
// Convert the image to text
|
||||||
completion, err := app.VisionLLM.GenerateContent(ctx, []llms.MessageContent{
|
completion, err := app.VisionLLM.GenerateContent(ctx, []llms.MessageContent{
|
||||||
{
|
{
|
||||||
Parts: []llms.ContentPart{
|
Parts: parts,
|
||||||
llms.BinaryPart("image/jpeg", jpegBytes),
|
Role: llms.ChatMessageTypeHuman,
|
||||||
llms.TextPart(prompt),
|
|
||||||
},
|
|
||||||
Role: llms.ChatMessageTypeHuman,
|
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
Loading…
Reference in a new issue