mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-12 21:08:00 -05:00
feat(ocr): implement OCR provider interface and add Google Document AI and LLM providers
This commit is contained in:
parent
b1a7b9992d
commit
c19c2f878a
8 changed files with 399 additions and 29 deletions
26
README.md
26
README.md
|
@ -101,14 +101,28 @@ services:
|
|||
OPENAI_API_KEY: 'your_openai_api_key'
|
||||
# Optional - OPENAI_BASE_URL: 'https://litellm.yourinstallationof.it.com/v1'
|
||||
LLM_LANGUAGE: 'English' # Optional, default: English
|
||||
|
||||
# OCR Configuration - Choose one:
|
||||
# Option 1: LLM-based OCR
|
||||
OCR_PROVIDER: 'llm' # Default OCR provider
|
||||
VISION_LLM_PROVIDER: 'ollama' # openai or ollama
|
||||
VISION_LLM_MODEL: 'minicpm-v' # minicpm-v (ollama) or gpt-4v (openai)
|
||||
OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
|
||||
VISION_LLM_PROVIDER: 'ollama' # (for OCR) - openai or ollama
|
||||
VISION_LLM_MODEL: 'minicpm-v' # (for OCR) - minicpm-v (ollama example), gpt-4o (for openai), etc.
|
||||
|
||||
# Option 2: Google Document AI
|
||||
# OCR_PROVIDER: 'google_docai' # Use Google Document AI
|
||||
# GOOGLE_PROJECT_ID: 'your-project' # Your GCP project ID
|
||||
# GOOGLE_LOCATION: 'us' # Document AI region
|
||||
# GOOGLE_PROCESSOR_ID: 'processor-id' # Your processor ID
|
||||
# GOOGLE_APPLICATION_CREDENTIALS: '/app/credentials.json' # Path to service account key
|
||||
|
||||
AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default: paperless-gpt-ocr-auto
|
||||
OCR_LIMIT_PAGES: '5' # Optional, default: 5. Set to 0 for no limit.
|
||||
LOG_LEVEL: 'info' # Optional: debug, warn, error
|
||||
volumes:
|
||||
- ./prompts:/app/prompts # Mount the prompts directory
|
||||
# For Google Document AI:
|
||||
# - ./credentials.json:/app/credentials.json
|
||||
ports:
|
||||
- "8080:8080"
|
||||
depends_on:
|
||||
|
@ -169,8 +183,12 @@ services:
|
|||
| `OPENAI_BASE_URL` | OpenAI base URL (optional, if using a custom OpenAI compatible service like LiteLLM). | No |
|
||||
| `LLM_LANGUAGE` | Likely language for documents (e.g. `English`). Default: `English`. | No |
|
||||
| `OLLAMA_HOST` | Ollama server URL (e.g. `http://host.docker.internal:11434`). | No |
|
||||
| `VISION_LLM_PROVIDER` | AI backend for OCR (`openai` or `ollama`). | No |
|
||||
| `VISION_LLM_MODEL` | Model name for OCR (e.g. `minicpm-v`). | No |
|
||||
| `OCR_PROVIDER` | OCR provider to use (`llm` or `google_docai`). Default: `llm`. | No |
|
||||
| `VISION_LLM_PROVIDER` | AI backend for LLM OCR (`openai` or `ollama`). Required if OCR_PROVIDER is `llm`. | Cond. |
|
||||
| `VISION_LLM_MODEL` | Model name for LLM OCR (e.g. `minicpm-v`). Required if OCR_PROVIDER is `llm`. | Cond. |
|
||||
| `GOOGLE_PROJECT_ID` | Google Cloud project ID. Required if OCR_PROVIDER is `google_docai`. | Cond. |
|
||||
| `GOOGLE_LOCATION` | Google Cloud region (e.g. `us`, `eu`). Required if OCR_PROVIDER is `google_docai`. | Cond. |
|
||||
| `GOOGLE_PROCESSOR_ID` | Document AI processor ID. Required if OCR_PROVIDER is `google_docai`. | Cond. |
|
||||
| `AUTO_OCR_TAG` | Tag for automatically processing docs with OCR. Default: `paperless-gpt-ocr-auto`. | No |
|
||||
| `LOG_LEVEL` | Application log level (`info`, `debug`, `warn`, `error`). Default: `info`. | No |
|
||||
| `LISTEN_INTERFACE` | Network interface to listen on. Default: `:8080`. | No |
|
||||
|
|
57
go.mod
57
go.mod
|
@ -5,20 +5,28 @@ go 1.22.0
|
|||
toolchain go1.23.6
|
||||
|
||||
require (
|
||||
github.com/Masterminds/sprig/v3 v3.3.0
|
||||
github.com/fatih/color v1.18.0
|
||||
github.com/gen2brain/go-fitz v1.24.14
|
||||
github.com/gin-gonic/gin v1.10.0
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/stretchr/testify v1.10.0
|
||||
github.com/tmc/langchaingo v0.1.13-pre.1
|
||||
golang.org/x/sync v0.11.0
|
||||
gorm.io/driver/sqlite v1.5.7
|
||||
gorm.io/gorm v1.25.12
|
||||
github.com/Masterminds/sprig/v3 v3.3.0
|
||||
github.com/fatih/color v1.18.0
|
||||
github.com/gen2brain/go-fitz v1.24.14
|
||||
github.com/gin-gonic/gin v1.10.0
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/stretchr/testify v1.10.0
|
||||
github.com/tmc/langchaingo v0.1.13-pre.1
|
||||
golang.org/x/sync v0.11.0
|
||||
gorm.io/driver/sqlite v1.5.7
|
||||
gorm.io/gorm v1.25.12
|
||||
cloud.google.com/go/documentai v1.35.1
|
||||
google.golang.org/api v0.214.0
|
||||
github.com/gabriel-vasile/mimetype v1.4.3
|
||||
)
|
||||
|
||||
require (
|
||||
cloud.google.com/go v0.116.0 // indirect
|
||||
cloud.google.com/go/auth v0.13.0 // indirect
|
||||
cloud.google.com/go/auth/oauth2adapt v0.2.6 // indirect
|
||||
cloud.google.com/go/compute/metadata v0.6.0 // indirect
|
||||
cloud.google.com/go/longrunning v0.6.2 // indirect
|
||||
dario.cat/mergo v1.0.1 // indirect
|
||||
github.com/Masterminds/goutils v1.1.1 // indirect
|
||||
github.com/Masterminds/semver/v3 v3.3.0 // indirect
|
||||
|
@ -29,12 +37,18 @@ require (
|
|||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dlclark/regexp2 v1.10.0 // indirect
|
||||
github.com/ebitengine/purego v0.8.0 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
|
||||
github.com/gin-contrib/sse v0.1.0 // indirect
|
||||
github.com/go-logr/logr v1.4.2 // indirect
|
||||
github.com/go-logr/stdr v1.2.2 // indirect
|
||||
github.com/go-playground/locales v0.14.1 // indirect
|
||||
github.com/go-playground/universal-translator v0.18.1 // indirect
|
||||
github.com/go-playground/validator/v10 v10.20.0 // indirect
|
||||
github.com/goccy/go-json v0.10.2 // indirect
|
||||
github.com/google/s2a-go v0.1.8 // indirect
|
||||
github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect
|
||||
github.com/googleapis/gax-go/v2 v2.14.0 // indirect
|
||||
github.com/huandu/xstrings v1.5.0 // indirect
|
||||
github.com/jinzhu/inflection v1.0.0 // indirect
|
||||
github.com/jinzhu/now v1.1.5 // indirect
|
||||
|
@ -61,11 +75,22 @@ require (
|
|||
gitlab.com/golang-commonmark/markdown v0.0.0-20211110145824-bf3e522c626a // indirect
|
||||
gitlab.com/golang-commonmark/mdurl v0.0.0-20191124015652-932350d1cb84 // indirect
|
||||
gitlab.com/golang-commonmark/puny v0.0.0-20191124015043-9f83538fa04f // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 // indirect
|
||||
go.opentelemetry.io/otel v1.29.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.29.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.29.0 // indirect
|
||||
golang.org/x/arch v0.8.0 // indirect
|
||||
golang.org/x/crypto v0.29.0 // indirect
|
||||
golang.org/x/net v0.25.0 // indirect
|
||||
golang.org/x/sys v0.27.0 // indirect
|
||||
golang.org/x/text v0.20.0 // indirect
|
||||
google.golang.org/protobuf v1.34.1 // indirect
|
||||
golang.org/x/crypto v0.31.0 // indirect
|
||||
golang.org/x/net v0.33.0 // indirect
|
||||
golang.org/x/oauth2 v0.24.0 // indirect
|
||||
golang.org/x/sys v0.28.0 // indirect
|
||||
golang.org/x/text v0.21.0 // indirect
|
||||
golang.org/x/time v0.8.0 // indirect
|
||||
google.golang.org/genproto v0.0.0-20241118233622-e639e219e697 // indirect
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20241118233622-e639e219e697 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576 // indirect
|
||||
google.golang.org/grpc v1.67.3 // indirect
|
||||
google.golang.org/protobuf v1.35.2 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
|
|
59
go.sum
59
go.sum
|
@ -1,3 +1,15 @@
|
|||
cloud.google.com/go v0.116.0 h1:B3fRrSDkLRt5qSHWe40ERJvhvnQwdZiHu0bJOpldweE=
|
||||
cloud.google.com/go v0.116.0/go.mod h1:cEPSRWPzZEswwdr9BxE6ChEn01dWlTaF05LiC2Xs70U=
|
||||
cloud.google.com/go/auth v0.13.0 h1:8Fu8TZy167JkW8Tj3q7dIkr2v4cndv41ouecJx0PAHs=
|
||||
cloud.google.com/go/auth v0.13.0/go.mod h1:COOjD9gwfKNKz+IIduatIhYJQIc0mG3H102r/EMxX6Q=
|
||||
cloud.google.com/go/auth/oauth2adapt v0.2.6 h1:V6a6XDu2lTwPZWOawrAa9HUK+DB2zfJyTuciBG5hFkU=
|
||||
cloud.google.com/go/auth/oauth2adapt v0.2.6/go.mod h1:AlmsELtlEBnaNTL7jCj8VQFLy6mbZv0s4Q7NGBeQ5E8=
|
||||
cloud.google.com/go/compute/metadata v0.6.0 h1:A6hENjEsCDtC1k8byVsgwvVcioamEHvZ4j01OwKxG9I=
|
||||
cloud.google.com/go/compute/metadata v0.6.0/go.mod h1:FjyFAW1MW0C203CEOMDTu3Dk1FlqW3Rga40jzHL4hfg=
|
||||
cloud.google.com/go/documentai v1.35.1 h1:52RfiUsoblXcE57CfKJGnITWLxRM30BcqNk/BKZl2LI=
|
||||
cloud.google.com/go/documentai v1.35.1/go.mod h1:WJjwUAQfwQPJORW8fjz7RODprMULDzEGLA2E6WxenFw=
|
||||
cloud.google.com/go/longrunning v0.6.2 h1:xjDfh1pQcWPEvnfjZmwjKQEcHnpz6lHjfy7Fo0MK+hc=
|
||||
cloud.google.com/go/longrunning v0.6.2/go.mod h1:k/vIs83RN4bE3YCswdXC5PFfWVILjm3hpEUlSko4PiI=
|
||||
dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s=
|
||||
dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
|
||||
github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI=
|
||||
|
@ -23,6 +35,8 @@ github.com/ebitengine/purego v0.8.0 h1:JbqvnEzRvPpxhCJzJJ2y0RbiZ8nyjccVUrSM3q+Gv
|
|||
github.com/ebitengine/purego v0.8.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
|
||||
github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM=
|
||||
github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU=
|
||||
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
|
||||
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
|
||||
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
|
||||
github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
|
||||
github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
|
||||
|
@ -33,6 +47,11 @@ github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE
|
|||
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
|
||||
github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU=
|
||||
github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y=
|
||||
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
|
||||
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
|
||||
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
|
||||
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
|
||||
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
|
||||
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
|
||||
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
|
||||
github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
|
||||
|
@ -46,8 +65,14 @@ github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MG
|
|||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/s2a-go v0.1.8 h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM=
|
||||
github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw=
|
||||
github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA=
|
||||
github.com/googleapis/gax-go/v2 v2.14.0 h1:f+jMrjBPl+DL9nI4IQzLUxMq7XrAqFYB7hBPqMNIe8o=
|
||||
github.com/googleapis/gax-go/v2 v2.14.0/go.mod h1:lhBCnjdLrWRaPvLWhmc8IS24m9mr07qSYnHncrgo+zk=
|
||||
github.com/huandu/xstrings v1.5.0 h1:2ag3IFq9ZDANvthTwTiqSSZLjDc+BedvHPAp5tJy2TI=
|
||||
github.com/huandu/xstrings v1.5.0/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
|
||||
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
|
||||
|
@ -135,6 +160,16 @@ gitlab.com/golang-commonmark/puny v0.0.0-20191124015043-9f83538fa04f h1:Wku8eEde
|
|||
gitlab.com/golang-commonmark/puny v0.0.0-20191124015043-9f83538fa04f/go.mod h1:Tiuhl+njh/JIg0uS/sOJVYi0x2HEa5rc1OAaVsb5tAs=
|
||||
gitlab.com/opennota/wd v0.0.0-20180912061657-c5d65f63c638 h1:uPZaMiz6Sz0PZs3IZJWpU5qHKGNy///1pacZC9txiUI=
|
||||
gitlab.com/opennota/wd v0.0.0-20180912061657-c5d65f63c638/go.mod h1:EGRJaqe2eO9XGmFtQCvV3Lm9NLico3UhFwUpCG/+mVU=
|
||||
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0 h1:r6I7RJCN86bpD/FQwedZ0vSixDpwuWREjW9oRMsmqDc=
|
||||
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0/go.mod h1:B9yO6b04uB80CzjedvewuqDhxJxi11s7/GtiGa8bAjI=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 h1:TT4fX+nBOA/+LUkobKGW1ydGcn+G3vRw9+g5HwCphpk=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0/go.mod h1:L7UH0GbB0p47T4Rri3uHjbpCFYrVrwc1I25QhNPiGK8=
|
||||
go.opentelemetry.io/otel v1.29.0 h1:PdomN/Al4q/lN6iBJEN3AwPvUiHPMlt93c8bqTG5Llw=
|
||||
go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8=
|
||||
go.opentelemetry.io/otel/metric v1.29.0 h1:vPf/HFWTNkPu1aYeIsc98l4ktOQaL6LeSoeV2g+8YLc=
|
||||
go.opentelemetry.io/otel/metric v1.29.0/go.mod h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8=
|
||||
go.opentelemetry.io/otel/trace v1.29.0 h1:J/8ZNK4XgR7a21DZUAsbF8pZ5Jcw1VhACmnYt39JTi4=
|
||||
go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ=
|
||||
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
|
||||
golang.org/x/arch v0.8.0 h1:3wRIsP3pM4yUptoR96otTUOXI367OS0+c9eeRi9doIc=
|
||||
golang.org/x/arch v0.8.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys=
|
||||
|
@ -142,8 +177,14 @@ golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
|
|||
golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
|
||||
golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ=
|
||||
golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg=
|
||||
golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
|
||||
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
|
||||
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
|
||||
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
|
||||
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
||||
golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE=
|
||||
golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
|
||||
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
|
||||
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
|
||||
|
@ -156,12 +197,30 @@ golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
|
|||
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=
|
||||
golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
|
||||
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
|
||||
golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
|
||||
golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
|
||||
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
|
||||
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
|
||||
golang.org/x/time v0.8.0 h1:9i3RxcPv3PZnitoVGMPDKZSq1xW1gK1Xy3ArNOGZfEg=
|
||||
golang.org/x/time v0.8.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
google.golang.org/api v0.214.0 h1:h2Gkq07OYi6kusGOaT/9rnNljuXmqPnaig7WGPmKbwA=
|
||||
google.golang.org/api v0.214.0/go.mod h1:bYPpLG8AyeMWwDU6NXoB00xC0DFkikVvd5MfwoxjLqE=
|
||||
google.golang.org/genproto v0.0.0-20241118233622-e639e219e697 h1:ToEetK57OidYuqD4Q5w+vfEnPvPpuTwedCNVohYJfNk=
|
||||
google.golang.org/genproto v0.0.0-20241118233622-e639e219e697/go.mod h1:JJrvXBWRZaFMxBufik1a4RpFw4HhgVtBBWQeQgUj2cc=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20241118233622-e639e219e697 h1:pgr/4QbFyktUv9CtQ/Fq4gzEE6/Xs7iCXbktaGzLHbQ=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20241118233622-e639e219e697/go.mod h1:+D9ySVjN8nY8YCVjc5O7PZDIdZporIDY3KaGfJunh88=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576 h1:8ZmaLZE4XWrtU3MyClkYqqtl6Oegr3235h7jxsDyqCY=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576/go.mod h1:5uTbfoYQed2U9p3KIj2/Zzm02PYhndfdmML0qC3q3FU=
|
||||
google.golang.org/grpc v1.67.3 h1:OgPcDAFKHnH8X3O4WcO4XUc8GRDeKsKReqbQtiCj7N8=
|
||||
google.golang.org/grpc v1.67.3/go.mod h1:YGaHCc6Oap+FzBJTZLBzkGSYt/cvGPFTPxkn7QfSU8s=
|
||||
google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg=
|
||||
google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
|
||||
google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io=
|
||||
google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
|
||||
|
|
40
main.go
40
main.go
|
@ -5,6 +5,7 @@ import (
|
|||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"paperless-gpt/ocr"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
|
@ -113,10 +114,11 @@ Document Content:
|
|||
|
||||
// App struct to hold dependencies
|
||||
type App struct {
|
||||
Client *PaperlessClient
|
||||
Database *gorm.DB
|
||||
LLM llms.Model
|
||||
VisionLLM llms.Model
|
||||
Client *PaperlessClient
|
||||
Database *gorm.DB
|
||||
LLM llms.Model
|
||||
VisionLLM llms.Model
|
||||
ocrProvider ocr.Provider // OCR provider interface
|
||||
}
|
||||
|
||||
func main() {
|
||||
|
@ -150,12 +152,34 @@ func main() {
|
|||
log.Fatalf("Failed to create Vision LLM client: %v", err)
|
||||
}
|
||||
|
||||
// Initialize OCR provider
|
||||
var ocrProvider ocr.Provider
|
||||
providerType := os.Getenv("OCR_PROVIDER")
|
||||
if providerType == "" {
|
||||
providerType = "llm" // Default to LLM provider
|
||||
}
|
||||
|
||||
ocrConfig := ocr.Config{
|
||||
Provider: providerType,
|
||||
GoogleProjectID: os.Getenv("GOOGLE_PROJECT_ID"),
|
||||
GoogleLocation: os.Getenv("GOOGLE_LOCATION"),
|
||||
GoogleProcessorID: os.Getenv("GOOGLE_PROCESSOR_ID"),
|
||||
VisionLLMProvider: visionLlmProvider,
|
||||
VisionLLMModel: visionLlmModel,
|
||||
}
|
||||
|
||||
ocrProvider, err = ocr.NewProvider(ocrConfig)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to initialize OCR provider: %v", err)
|
||||
}
|
||||
|
||||
// Initialize App with dependencies
|
||||
app := &App{
|
||||
Client: client,
|
||||
Database: database,
|
||||
LLM: llm,
|
||||
VisionLLM: visionLlm,
|
||||
Client: client,
|
||||
Database: database,
|
||||
LLM: llm,
|
||||
VisionLLM: visionLlm,
|
||||
ocrProvider: ocrProvider,
|
||||
}
|
||||
|
||||
// Start background process for auto-tagging
|
||||
|
|
2
ocr.go
2
ocr.go
|
@ -36,7 +36,7 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string,
|
|||
return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err)
|
||||
}
|
||||
|
||||
ocrText, err := app.doOCRViaLLM(ctx, imageContent, pageLogger)
|
||||
ocrText, err := app.ocrProvider.ProcessImage(ctx, imageContent)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err)
|
||||
}
|
||||
|
|
89
ocr/google_docai_provider.go
Normal file
89
ocr/google_docai_provider.go
Normal file
|
@ -0,0 +1,89 @@
|
|||
package ocr
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
documentai "cloud.google.com/go/documentai/apiv1"
|
||||
"cloud.google.com/go/documentai/apiv1/documentaipb"
|
||||
"github.com/gabriel-vasile/mimetype"
|
||||
"google.golang.org/api/option"
|
||||
)
|
||||
|
||||
// GoogleDocAIProvider implements OCR using Google Document AI
|
||||
type GoogleDocAIProvider struct {
|
||||
projectID string
|
||||
location string
|
||||
processorID string
|
||||
client *documentai.DocumentProcessorClient
|
||||
}
|
||||
|
||||
func newGoogleDocAIProvider(config Config) (*GoogleDocAIProvider, error) {
|
||||
ctx := context.Background()
|
||||
endpoint := fmt.Sprintf("%s-documentai.googleapis.com:443", config.GoogleLocation)
|
||||
|
||||
client, err := documentai.NewDocumentProcessorClient(ctx, option.WithEndpoint(endpoint))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error creating Document AI client: %w", err)
|
||||
}
|
||||
|
||||
return &GoogleDocAIProvider{
|
||||
projectID: config.GoogleProjectID,
|
||||
location: config.GoogleLocation,
|
||||
processorID: config.GoogleProcessorID,
|
||||
client: client,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p *GoogleDocAIProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
|
||||
// Detect MIME type
|
||||
mtype := mimetype.Detect(imageContent)
|
||||
if !isImageMIMEType(mtype.String()) {
|
||||
return "", fmt.Errorf("unsupported file type: %s", mtype.String())
|
||||
}
|
||||
|
||||
name := fmt.Sprintf("projects/%s/locations/%s/processors/%s", p.projectID, p.location, p.processorID)
|
||||
|
||||
req := &documentaipb.ProcessRequest{
|
||||
Name: name,
|
||||
Source: &documentaipb.ProcessRequest_RawDocument{
|
||||
RawDocument: &documentaipb.RawDocument{
|
||||
Content: imageContent,
|
||||
MimeType: mtype.String(),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
resp, err := p.client.ProcessDocument(ctx, req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error processing document: %w", err)
|
||||
}
|
||||
|
||||
if resp == nil || resp.Document == nil {
|
||||
return "", fmt.Errorf("received nil response or document from Document AI")
|
||||
}
|
||||
|
||||
if resp.Document.Error != nil {
|
||||
return "", fmt.Errorf("document processing error: %s", resp.Document.Error.Message)
|
||||
}
|
||||
|
||||
return resp.Document.Text, nil
|
||||
}
|
||||
|
||||
// isImageMIMEType reports whether mimeType is one of the content types the
// Document AI provider accepts: JPEG, PNG, TIFF, BMP, or PDF. The comparison
// is an exact, case-sensitive string match.
func isImageMIMEType(mimeType string) bool {
	accepted := [...]string{
		"image/jpeg",
		"image/jpg",
		"image/png",
		"image/tiff",
		"image/bmp",
		"application/pdf",
	}
	for _, candidate := range accepted {
		if candidate == mimeType {
			return true
		}
	}
	return false
}
|
||||
|
||||
// Close releases resources used by the provider
|
||||
func (p *GoogleDocAIProvider) Close() error {
|
||||
if p.client != nil {
|
||||
return p.client.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
111
ocr/llm_provider.go
Normal file
111
ocr/llm_provider.go
Normal file
|
@ -0,0 +1,111 @@
|
|||
package ocr
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"image"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
_ "image/jpeg"
|
||||
|
||||
"github.com/tmc/langchaingo/llms"
|
||||
"github.com/tmc/langchaingo/llms/ollama"
|
||||
"github.com/tmc/langchaingo/llms/openai"
|
||||
)
|
||||
|
||||
// LLMProvider implements OCR using LLM vision models
|
||||
type LLMProvider struct {
|
||||
provider string
|
||||
model string
|
||||
llm llms.Model
|
||||
template string // OCR prompt template
|
||||
}
|
||||
|
||||
func newLLMProvider(config Config) (*LLMProvider, error) {
|
||||
var model llms.Model
|
||||
var err error
|
||||
|
||||
switch strings.ToLower(config.VisionLLMProvider) {
|
||||
case "openai":
|
||||
model, err = createOpenAIClient(config)
|
||||
case "ollama":
|
||||
model, err = createOllamaClient(config)
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported vision LLM provider: %s", config.VisionLLMProvider)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error creating vision LLM client: %w", err)
|
||||
}
|
||||
|
||||
return &LLMProvider{
|
||||
provider: config.VisionLLMProvider,
|
||||
model: config.VisionLLMModel,
|
||||
llm: model,
|
||||
template: defaultOCRPrompt,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// createOpenAIClient creates a new OpenAI vision model client
|
||||
func createOpenAIClient(config Config) (llms.Model, error) {
|
||||
if os.Getenv("OPENAI_API_KEY") == "" {
|
||||
return nil, fmt.Errorf("OpenAI API key is not set")
|
||||
}
|
||||
return openai.New(
|
||||
openai.WithModel(config.VisionLLMModel),
|
||||
openai.WithToken(os.Getenv("OPENAI_API_KEY")),
|
||||
)
|
||||
}
|
||||
|
||||
// createOllamaClient creates a new Ollama vision model client
|
||||
func createOllamaClient(config Config) (llms.Model, error) {
|
||||
host := os.Getenv("OLLAMA_HOST")
|
||||
if host == "" {
|
||||
host = "http://127.0.0.1:11434"
|
||||
}
|
||||
return ollama.New(
|
||||
ollama.WithModel(config.VisionLLMModel),
|
||||
ollama.WithServerURL(host),
|
||||
)
|
||||
}
|
||||
|
||||
func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte) (string, error) {
|
||||
// Decode image to validate format and get dimensions for logging
|
||||
_, _, err := image.Decode(bytes.NewReader(imageContent))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error decoding image: %w", err)
|
||||
}
|
||||
|
||||
// Prepare content parts based on provider type
|
||||
var parts []llms.ContentPart
|
||||
if strings.ToLower(p.provider) != "openai" {
|
||||
parts = []llms.ContentPart{
|
||||
llms.BinaryPart("image/jpeg", imageContent),
|
||||
llms.TextPart(p.template),
|
||||
}
|
||||
} else {
|
||||
base64Image := base64.StdEncoding.EncodeToString(imageContent)
|
||||
parts = []llms.ContentPart{
|
||||
llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
|
||||
llms.TextPart(p.template),
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the image to text
|
||||
completion, err := p.llm.GenerateContent(ctx, []llms.MessageContent{
|
||||
{
|
||||
Parts: parts,
|
||||
Role: llms.ChatMessageTypeHuman,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error getting response from LLM: %w", err)
|
||||
}
|
||||
|
||||
return completion.Choices[0].Content, nil
|
||||
}
|
||||
|
||||
// defaultOCRPrompt is the instruction text newLLMProvider installs as the
// provider's prompt template; ProcessImage sends it to the model as the text
// part that accompanies each image. It asks for a complete transcription in
// markdown without a surrounding code block.
const defaultOCRPrompt = `Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format but without a code block.`
|
44
ocr/provider.go
Normal file
44
ocr/provider.go
Normal file
|
@ -0,0 +1,44 @@
|
|||
package ocr
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// Provider is the abstraction every OCR backend implements.
type Provider interface {
	// ProcessImage extracts the text contained in imageContent and returns
	// it, or a non-nil error when extraction fails.
	ProcessImage(ctx context.Context, imageContent []byte) (string, error)
}
|
||||
|
||||
// Config carries the settings NewProvider needs to construct an OCR backend.
type Config struct {
	// Provider selects the backend: "llm" or "google_docai".
	Provider string

	// Document AI settings. NewProvider requires all three to be non-empty
	// when Provider is "google_docai".
	GoogleProjectID, GoogleLocation, GoogleProcessorID string

	// Vision LLM settings. NewProvider requires both to be non-empty when
	// Provider is "llm".
	VisionLLMProvider, VisionLLMModel string
}
|
||||
|
||||
// NewProvider creates a new OCR provider based on configuration
|
||||
func NewProvider(config Config) (Provider, error) {
|
||||
switch config.Provider {
|
||||
case "google_docai":
|
||||
if config.GoogleProjectID == "" || config.GoogleLocation == "" || config.GoogleProcessorID == "" {
|
||||
return nil, fmt.Errorf("missing required Google Document AI configuration")
|
||||
}
|
||||
return newGoogleDocAIProvider(config)
|
||||
case "llm":
|
||||
if config.VisionLLMProvider == "" || config.VisionLLMModel == "" {
|
||||
return nil, fmt.Errorf("missing required LLM configuration")
|
||||
}
|
||||
return newLLMProvider(config)
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported OCR provider: %s", config.Provider)
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue