Mirror of https://github.com/icereed/paperless-gpt.git (synced 2025-03-14 05:38:01 -05:00)

Commit 686e37ae24 (parent 0a0be31678)
feat(logging): add structured logging to Google Document AI and LLM providers

3 changed files with 124 additions and 45 deletions
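Overview of the pattern introduced below: the ocr package now logs through a shared logrus logger (a package-level var log = logrus.New() in the provider factory file), and each operation derives an entry via WithFields so that identifiers such as provider, model, location, and processor_id ride along on every message. A minimal, self-contained sketch of that pattern, with made-up field values rather than code from the repository:

package main

import (
    "errors"

    "github.com/sirupsen/logrus"
)

// Package-level logger, mirroring the `var log = logrus.New()` added in this commit.
var log = logrus.New()

func main() {
    // Derive an entry that carries contextual fields; every message logged
    // through it includes these key/value pairs.
    logger := log.WithFields(logrus.Fields{
        "provider": "google_docai",
        "location": "eu",
    })
    logger.Info("Initializing OCR provider")

    // WithError attaches the error under an "error" field on the entry.
    logger.WithError(errors.New("connection refused")).Error("Failed to create client")
}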
Changed file 1 of 3 — Google Document AI provider:

@@ -7,6 +7,7 @@ import (
     documentai "cloud.google.com/go/documentai/apiv1"
     "cloud.google.com/go/documentai/apiv1/documentaipb"
     "github.com/gabriel-vasile/mimetype"
+    "github.com/sirupsen/logrus"
     "google.golang.org/api/option"
 )
@@ -19,26 +20,46 @@ type GoogleDocAIProvider struct {
 }

 func newGoogleDocAIProvider(config Config) (*GoogleDocAIProvider, error) {
+    logger := log.WithFields(logrus.Fields{
+        "location":     config.GoogleLocation,
+        "processor_id": config.GoogleProcessorID,
+    })
+    logger.Info("Creating new Google Document AI provider")
+
     ctx := context.Background()
     endpoint := fmt.Sprintf("%s-documentai.googleapis.com:443", config.GoogleLocation)

     client, err := documentai.NewDocumentProcessorClient(ctx, option.WithEndpoint(endpoint))
     if err != nil {
+        logger.WithError(err).Error("Failed to create Document AI client")
         return nil, fmt.Errorf("error creating Document AI client: %w", err)
     }

-    return &GoogleDocAIProvider{
+    provider := &GoogleDocAIProvider{
         projectID:   config.GoogleProjectID,
         location:    config.GoogleLocation,
         processorID: config.GoogleProcessorID,
         client:      client,
-    }, nil
+    }
+
+    logger.Info("Successfully initialized Google Document AI provider")
+    return provider, nil
 }

 func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
+    logger := log.WithFields(logrus.Fields{
+        "project_id":   p.projectID,
+        "location":     p.location,
+        "processor_id": p.processorID,
+    })
+    logger.Debug("Starting Document AI processing")
+
     // Detect MIME type
     mtype := mimetype.Detect(imageContent)
+    logger.WithField("mime_type", mtype.String()).Debug("Detected file type")

     if !isImageMIMEType(mtype.String()) {
+        logger.WithField("mime_type", mtype.String()).Error("Unsupported file type")
         return "", fmt.Errorf("unsupported file type: %s", mtype.String())
     }
@@ -54,30 +75,38 @@ func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
         },
     }

+    logger.Debug("Sending request to Document AI")
     resp, err := p.client.ProcessDocument(ctx, req)
     if err != nil {
+        logger.WithError(err).Error("Failed to process document")
         return "", fmt.Errorf("error processing document: %w", err)
     }

     if resp == nil || resp.Document == nil {
+        logger.Error("Received nil response or document from Document AI")
         return "", fmt.Errorf("received nil response or document from Document AI")
     }

     if resp.Document.Error != nil {
+        logger.WithField("error", resp.Document.Error.Message).Error("Document processing error")
         return "", fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
     }

+    logger.WithField("content_length", len(resp.Document.Text)).Info("Successfully processed document")
     return resp.Document.Text, nil
 }

 // isImageMIMEType checks if the given MIME type is a supported image type
 func isImageMIMEType(mimeType string) bool {
-    switch mimeType {
-    case "image/jpeg", "image/jpg", "image/png", "image/tiff", "image/bmp", "application/pdf":
-        return true
-    default:
-        return false
+    supportedTypes := map[string]bool{
+        "image/jpeg":      true,
+        "image/jpg":       true,
+        "image/png":       true,
+        "image/tiff":      true,
+        "image/bmp":       true,
+        "application/pdf": true,
     }
+    return supportedTypes[mimeType]
 }

 // Close releases resources used by the provider
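Note on the isImageMIMEType change above: the switch becomes a map literal, so the supported-type list lives in one place, adding a type is a one-line edit, and the lookup's zero value (false) covers everything else. A hypothetical table-driven test for the new lookup, assuming it sits in the same ocr package; it is not part of this commit:

package ocr

import "testing"

// Hypothetical test, for illustration only: exercises the map-based lookup.
func TestIsImageMIMEType(t *testing.T) {
    cases := map[string]bool{
        "image/png":       true,
        "application/pdf": true,
        "text/plain":      false,
        "":                false,
    }
    for mime, want := range cases {
        if got := isImageMIMEType(mime); got != want {
            t.Errorf("isImageMIMEType(%q) = %v, want %v", mime, got, want)
        }
    }
}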
Changed file 2 of 3 — LLM vision provider:

@@ -11,6 +11,7 @@ import (

     _ "image/jpeg"

+    "github.com/sirupsen/logrus"
     "github.com/tmc/langchaingo/llms"
     "github.com/tmc/langchaingo/llms/ollama"
     "github.com/tmc/langchaingo/llms/openai"
@@ -25,22 +26,32 @@ type LLMProvider struct {
 }

 func newLLMProvider(config Config) (*LLMProvider, error) {
+    logger := log.WithFields(logrus.Fields{
+        "provider": config.VisionLLMProvider,
+        "model":    config.VisionLLMModel,
+    })
+    logger.Info("Creating new LLM OCR provider")
+
     var model llms.Model
     var err error

     switch strings.ToLower(config.VisionLLMProvider) {
     case "openai":
+        logger.Debug("Initializing OpenAI vision model")
         model, err = createOpenAIClient(config)
     case "ollama":
+        logger.Debug("Initializing Ollama vision model")
         model, err = createOllamaClient(config)
     default:
         return nil, fmt.Errorf("unsupported vision LLM provider: %s", config.VisionLLMProvider)
     }

     if err != nil {
+        logger.WithError(err).Error("Failed to create vision LLM client")
         return nil, fmt.Errorf("error creating vision LLM client: %w", err)
     }

+    logger.Info("Successfully initialized LLM OCR provider")
     return &LLMProvider{
         provider: config.VisionLLMProvider,
         model:    config.VisionLLMModel,
@@ -49,14 +60,68 @@ func newLLMProvider(config Config) (*LLMProvider, error) {
     }, nil
 }

+func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
+    logger := log.WithFields(logrus.Fields{
+        "provider": p.provider,
+        "model":    p.model,
+    })
+    logger.Debug("Starting OCR processing")
+
+    // Log the image dimensions
+    img, _, err := image.Decode(bytes.NewReader(imageContent))
+    if err != nil {
+        logger.WithError(err).Error("Failed to decode image")
+        return "", fmt.Errorf("error decoding image: %w", err)
+    }
+    bounds := img.Bounds()
+    logger.WithFields(logrus.Fields{
+        "width":  bounds.Dx(),
+        "height": bounds.Dy(),
+    }).Debug("Image dimensions")
+
+    // Prepare content parts based on provider type
+    var parts []llms.ContentPart
+    if strings.ToLower(p.provider) != "openai" {
+        logger.Debug("Using binary image format for non-OpenAI provider")
+        parts = []llms.ContentPart{
+            llms.BinaryPart("image/jpeg", imageContent),
+            llms.TextPart(p.template),
+        }
+    } else {
+        logger.Debug("Using base64 image format for OpenAI provider")
+        base64Image := base64.StdEncoding.EncodeToString(imageContent)
+        parts = []llms.ContentPart{
+            llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
+            llms.TextPart(p.template),
+        }
+    }
+
+    // Convert the image to text
+    logger.Debug("Sending request to vision model")
+    completion, err := p.llm.GenerateContent(ctx, []llms.MessageContent{
+        {
+            Parts: parts,
+            Role:  llms.ChatMessageTypeHuman,
+        },
+    })
+    if err != nil {
+        logger.WithError(err).Error("Failed to get response from vision model")
+        return "", fmt.Errorf("error getting response from LLM: %w", err)
+    }
+
+    logger.WithField("content_length", len(completion.Choices[0].Content)).Info("Successfully processed image")
+    return completion.Choices[0].Content, nil
+}
+
 // createOpenAIClient creates a new OpenAI vision model client
 func createOpenAIClient(config Config) (llms.Model, error) {
-    if os.Getenv("OPENAI_API_KEY") == "" {
+    apiKey := os.Getenv("OPENAI_API_KEY")
+    if apiKey == "" {
         return nil, fmt.Errorf("OpenAI API key is not set")
     }
     return openai.New(
         openai.WithModel(config.VisionLLMModel),
-        openai.WithToken(os.Getenv("OPENAI_API_KEY")),
+        openai.WithToken(apiKey),
     )
 }
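The provider branch in ProcessImage above reflects how the backends accept images: OpenAI's vision chat API takes image URLs, including data: URLs, so the payload is base64-encoded and wrapped, while the binary-part path hands raw JPEG bytes to other backends such as Ollama. A standalone sketch of the data-URL construction used in the OpenAI branch; the helper name is hypothetical and not from the repository:

package main

import (
    "encoding/base64"
    "fmt"
)

// toJPEGDataURL mirrors the OpenAI branch above: base64-encode the image
// bytes and wrap them in a data: URL. (Hypothetical helper, for illustration.)
func toJPEGDataURL(img []byte) string {
    return fmt.Sprintf("data:image/jpeg;base64,%s", base64.StdEncoding.EncodeToString(img))
}

func main() {
    fmt.Println(toJPEGDataURL([]byte{0xFF, 0xD8, 0xFF}))
}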
@@ -72,40 +137,4 @@ func createOllamaClient(config Config) (llms.Model, error) {
     )
 }

-func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
-    // Decode image to validate format and get dimensions for logging
-    _, _, err := image.Decode(bytes.NewReader(imageContent))
-    if err != nil {
-        return "", fmt.Errorf("error decoding image: %w", err)
-    }
-
-    // Prepare content parts based on provider type
-    var parts []llms.ContentPart
-    if strings.ToLower(p.provider) != "openai" {
-        parts = []llms.ContentPart{
-            llms.BinaryPart("image/jpeg", imageContent),
-            llms.TextPart(p.template),
-        }
-    } else {
-        base64Image := base64.StdEncoding.EncodeToString(imageContent)
-        parts = []llms.ContentPart{
-            llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
-            llms.TextPart(p.template),
-        }
-    }
-
-    // Convert the image to text
-    completion, err := p.llm.GenerateContent(ctx, []llms.MessageContent{
-        {
-            Parts: parts,
-            Role:  llms.ChatMessageTypeHuman,
-        },
-    })
-    if err != nil {
-        return "", fmt.Errorf("error getting response from LLM: %w", err)
-    }
-
-    return completion.Choices[0].Content, nil
-}
-
 const defaultOCRPrompt = `Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format but without a code block.`
Changed file 3 of 3 — OCR provider factory (package ocr):

@@ -3,8 +3,12 @@ package ocr
 import (
     "context"
     "fmt"
+
+    "github.com/sirupsen/logrus"
 )

+var log = logrus.New()
+
 // Provider defines the interface for OCR processing
 type Provider interface {
     ProcessImage(ctx context.Context, imageContent []byte) (string, error)
@@ -27,18 +31,35 @@ type Config struct {

 // NewProvider creates a new OCR provider based on configuration
 func NewProvider(config Config) (Provider, error) {
+    log.Info("Initializing OCR provider: ", config.Provider)
+
     switch config.Provider {
     case "google_docai":
         if config.GoogleProjectID == "" || config.GoogleLocation == "" || config.GoogleProcessorID == "" {
             return nil, fmt.Errorf("missing required Google Document AI configuration")
         }
+        log.WithFields(logrus.Fields{
+            "location":     config.GoogleLocation,
+            "processor_id": config.GoogleProcessorID,
+        }).Info("Using Google Document AI provider")
         return newGoogleDocAIProvider(config)

     case "llm":
         if config.VisionLLMProvider == "" || config.VisionLLMModel == "" {
             return nil, fmt.Errorf("missing required LLM configuration")
         }
+        log.WithFields(logrus.Fields{
+            "provider": config.VisionLLMProvider,
+            "model":    config.VisionLLMModel,
+        }).Info("Using LLM OCR provider")
         return newLLMProvider(config)

     default:
         return nil, fmt.Errorf("unsupported OCR provider: %s", config.Provider)
     }
 }
+
+// SetLogLevel sets the logging level for the OCR package
+func SetLogLevel(level logrus.Level) {
+    log.SetLevel(level)
+}
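With SetLogLevel exported, a caller can surface the Debug entries added throughout this commit before constructing a provider. A hedged usage sketch, assuming the import path github.com/icereed/paperless-gpt/ocr and that Config needs only the fields shown in this diff; the model name and file path are illustrative:

package main

import (
    "context"
    "log"
    "os"

    "github.com/sirupsen/logrus"

    "github.com/icereed/paperless-gpt/ocr" // assumed import path
)

func main() {
    // Surface the Debug entries added in this commit.
    ocr.SetLogLevel(logrus.DebugLevel)

    provider, err := ocr.NewProvider(ocr.Config{
        Provider:          "llm",
        VisionLLMProvider: "ollama",
        VisionLLMModel:    "llava", // illustrative model name
    })
    if err != nil {
        log.Fatal(err)
    }

    img, err := os.ReadFile("page.jpg")
    if err != nil {
        log.Fatal(err)
    }

    text, err := provider.ProcessImage(context.Background(), img)
    if err != nil {
        log.Fatal(err)
    }
    log.Println(text)
}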