mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-12 21:08:00 -05:00
Play around with ollama and minicpm-v
This commit is contained in:
parent
3495e6baa6
commit
b607815803
1 changed file with 78 additions and 7 deletions
85
main.go
85
main.go
|
@ -29,6 +29,8 @@ var (
|
||||||
autoTag = "paperless-gpt-auto"
|
autoTag = "paperless-gpt-auto"
|
||||||
llmProvider = os.Getenv("LLM_PROVIDER")
|
llmProvider = os.Getenv("LLM_PROVIDER")
|
||||||
llmModel = os.Getenv("LLM_MODEL")
|
llmModel = os.Getenv("LLM_MODEL")
|
||||||
|
visionLlmProvider = os.Getenv("VISION_LLM_PROVIDER")
|
||||||
|
visionLlmModel = os.Getenv("VISION_LLM_MODEL")
|
||||||
|
|
||||||
// Templates
|
// Templates
|
||||||
titleTemplate *template.Template
|
titleTemplate *template.Template
|
||||||
|
@ -62,8 +64,9 @@ Be very selective and only choose the most relevant tags since too many tags wil
|
||||||
|
|
||||||
// App struct to hold dependencies
|
// App struct to hold dependencies
|
||||||
type App struct {
|
type App struct {
|
||||||
Client *PaperlessClient
|
Client *PaperlessClient
|
||||||
LLM llms.Model
|
LLM llms.Model
|
||||||
|
VisionLLM llms.Model
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
@ -82,10 +85,17 @@ func main() {
|
||||||
log.Fatalf("Failed to create LLM client: %v", err)
|
log.Fatalf("Failed to create LLM client: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Initialize Vision LLM
|
||||||
|
visionLlm, err := createVisionLLM()
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Failed to create Vision LLM client: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Initialize App with dependencies
|
// Initialize App with dependencies
|
||||||
app := &App{
|
app := &App{
|
||||||
Client: client,
|
Client: client,
|
||||||
LLM: llm,
|
LLM: llm,
|
||||||
|
VisionLLM: visionLlm,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start background process for auto-tagging
|
// Start background process for auto-tagging
|
||||||
|
@ -139,9 +149,26 @@ func main() {
|
||||||
c.File("./web-app/dist/index.html")
|
c.File("./web-app/dist/index.html")
|
||||||
})
|
})
|
||||||
|
|
||||||
log.Println("Server started on port :8080")
|
// log.Println("Server started on port :8080")
|
||||||
if err := router.Run(":8080"); err != nil {
|
// if err := router.Run(":8080"); err != nil {
|
||||||
log.Fatalf("Failed to run server: %v", err)
|
// log.Fatalf("Failed to run server: %v", err)
|
||||||
|
// }
|
||||||
|
images, err := client.DownloadDocumentAsImages(context.Background(), Document{
|
||||||
|
// Insert the document ID here to test OCR
|
||||||
|
ID: 531,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Failed to download document: %v", err)
|
||||||
|
}
|
||||||
|
for _, image := range images {
|
||||||
|
content, err := os.ReadFile(image)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Failed to read image: %v", err)
|
||||||
|
}
|
||||||
|
_, err = app.doOCRViaLLM(context.Background(), content)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Failed to OCR image: %v", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -438,6 +465,26 @@ func getLikelyLanguage() string {
|
||||||
return strings.Title(strings.ToLower(likelyLanguage))
|
return strings.Title(strings.ToLower(likelyLanguage))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, error) {
|
||||||
|
// Convert the image to text
|
||||||
|
completion, err := app.VisionLLM.GenerateContent(ctx, []llms.MessageContent{
|
||||||
|
{
|
||||||
|
Parts: []llms.ContentPart{
|
||||||
|
llms.BinaryPart("image/jpeg", jpegBytes),
|
||||||
|
llms.TextPart("Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format."),
|
||||||
|
},
|
||||||
|
Role: llms.ChatMessageTypeHuman,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("error getting response from LLM: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
result := completion.Choices[0].Content
|
||||||
|
fmt.Println(result)
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
// getSuggestedTitle generates a suggested title for a document using the LLM
|
// getSuggestedTitle generates a suggested title for a document using the LLM
|
||||||
func (app *App) getSuggestedTitle(ctx context.Context, content string) (string, error) {
|
func (app *App) getSuggestedTitle(ctx context.Context, content string) (string, error) {
|
||||||
likelyLanguage := getLikelyLanguage()
|
likelyLanguage := getLikelyLanguage()
|
||||||
|
@ -542,6 +589,30 @@ func createLLM() (llms.Model, error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func createVisionLLM() (llms.Model, error) {
|
||||||
|
switch strings.ToLower(visionLlmProvider) {
|
||||||
|
case "openai":
|
||||||
|
if openaiAPIKey == "" {
|
||||||
|
return nil, fmt.Errorf("OpenAI API key is not set")
|
||||||
|
}
|
||||||
|
return openai.New(
|
||||||
|
openai.WithModel(visionLlmModel),
|
||||||
|
openai.WithToken(openaiAPIKey),
|
||||||
|
)
|
||||||
|
case "ollama":
|
||||||
|
host := os.Getenv("OLLAMA_HOST")
|
||||||
|
if host == "" {
|
||||||
|
host = "http://127.0.0.1:11434"
|
||||||
|
}
|
||||||
|
return ollama.New(
|
||||||
|
ollama.WithModel(visionLlmModel),
|
||||||
|
ollama.WithServerURL(host),
|
||||||
|
)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unsupported LLM provider: %s", llmProvider)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// getPromptsHandler handles the GET /api/prompts endpoint
|
// getPromptsHandler handles the GET /api/prompts endpoint
|
||||||
func getPromptsHandler(c *gin.Context) {
|
func getPromptsHandler(c *gin.Context) {
|
||||||
templateMutex.RLock()
|
templateMutex.RLock()
|
||||||
|
|
Loading…
Reference in a new issue