Merge branch 'main' into correspondents

This commit is contained in:
Dominik Schröter 2025-01-13 14:23:39 +01:00
commit 515b78b6a1
20 changed files with 626 additions and 331 deletions

View file

@ -96,3 +96,7 @@ jobs:
cache-from: type=gha
cache-to: type=gha,mode=max
tags: ${{ env.TAGS }}
build-args: |
VERSION=${{ github.ref_type == 'tag' && github.ref_name || github.sha }}
COMMIT=${{ github.sha }}
BUILD_DATE=${{ github.event.repository.pushed_at }}

View file

@ -1,17 +1,33 @@
# Define top-level build arguments
ARG VERSION=docker-dev
ARG COMMIT=unknown
ARG BUILD_DATE=unknown
# Stage 1: Build the Go binary
FROM golang:1.22-alpine AS builder
FROM golang:1.23.4-alpine3.21 AS builder
# Set the working directory inside the container
WORKDIR /app
# Install necessary packages
RUN apk add --no-cache \
git \
gcc \
musl-dev \
mupdf \
mupdf-dev
# Package versions for Renovate
# renovate: datasource=repology depName=alpine_3_21/gcc versioning=loose
ENV GCC_VERSION=14.2.0-r4
# renovate: datasource=repology depName=alpine_3_21/musl-dev versioning=loose
ENV MUSL_DEV_VERSION=1.2.5-r8
# renovate: datasource=repology depName=alpine_3_21/mupdf versioning=loose
ENV MUPDF_VERSION=1.24.10-r0
# renovate: datasource=repology depName=alpine_3_21/mupdf-dev versioning=loose
ENV MUPDF_DEV_VERSION=1.24.10-r0
# renovate: datasource=repology depName=alpine_3_21/sed versioning=loose
ENV SED_VERSION=4.9-r2
# Install necessary packages with pinned versions
RUN apk add --no-cache \
"gcc=${GCC_VERSION}" \
"musl-dev=${MUSL_DEV_VERSION}" \
"mupdf=${MUPDF_VERSION}" \
"mupdf-dev=${MUPDF_DEV_VERSION}" \
"sed=${SED_VERSION}"
# Copy go.mod and go.sum files
COPY go.mod go.sum ./
@ -24,6 +40,18 @@ RUN CGO_ENABLED=1 go build -tags musl -o /dev/null github.com/mattn/go-sqlite3
# Now copy the actual source files
COPY *.go .
# Import ARGs from top level
ARG VERSION
ARG COMMIT
ARG BUILD_DATE
# Update version information
RUN sed -i \
-e "s/devVersion/${VERSION}/" \
-e "s/devBuildDate/${BUILD_DATE}/" \
-e "s/devCommit/${COMMIT}/" \
version.go
# Build the binary using caching for both go modules and build cache
RUN CGO_ENABLED=1 GOMAXPROCS=$(nproc) go build -tags musl -o paperless-gpt .
@ -51,6 +79,8 @@ RUN npm run build
# Stage 3: Create a lightweight image with the Go binary and frontend
FROM alpine:latest
ENV GIN_MODE=release
# Install necessary runtime dependencies
RUN apk add --no-cache \
ca-certificates

559
README.md
View file

@ -1,122 +1,129 @@
# paperless-gpt
[![License](https://img.shields.io/github/license/icereed/paperless-gpt)](LICENSE)
[![Docker Pulls](https://img.shields.io/docker/pulls/icereed/paperless-gpt)](https://hub.docker.com/r/icereed/paperless-gpt)
[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](CODE_OF_CONDUCT.md)
![Screenshot](./paperless-gpt-screenshot.png)
**paperless-gpt** is a tool designed to generate accurate and meaningful document titles and tags for [paperless-ngx](https://github.com/paperless-ngx/paperless-ngx) using Large Language Models (LLMs). It supports multiple LLM providers, including **OpenAI** and **Ollama**. With paperless-gpt, you can streamline your document management by automatically suggesting appropriate titles and tags based on the content of your scanned documents.
**paperless-gpt** seamlessly pairs with [paperless-ngx][paperless-ngx] to generate **AI-powered document titles** and **tags**, saving you hours of manual sorting. While other tools may offer AI chat features, **paperless-gpt** stands out by **supercharging OCR with LLMs**—ensuring high accuracy, even with tricky scans. If you're craving next-level text extraction and effortless document organization, this is your solution.
[![Demo](./demo.gif)](./demo.gif)
https://github.com/user-attachments/assets/bd5d38b9-9309-40b9-93ca-918dfa4f3fd4
## Features
---
- **Multiple LLM Support**: Choose between OpenAI and Ollama for generating document titles and tags.
- **Customizable Prompts**: Modify the prompt templates to suit your specific needs.
- **Easy Integration**: Works seamlessly with your existing paperless-ngx setup.
- **User-Friendly Interface**: Intuitive web interface for reviewing and applying suggested titles and tags.
- **Dockerized Deployment**: Simple setup using Docker and Docker Compose.
- **Automatic Document Processing**: Automatically apply generated suggestions for documents with the `paperless-gpt-auto` tag.
- **Experimental OCR Feature**: Send documents to a vision LLM for OCR processing.
## Key Highlights
1. **LLM-Enhanced OCR**
Harness Large Language Models (OpenAI or Ollama) for **better-than-traditional** OCR—turn messy or low-quality scans into context-aware, high-fidelity text.
2. **Automatic Title & Tag Generation**
No more guesswork. Let the AI do the naming and categorizing. You can easily review suggestions and refine them if needed.
3. **Extensive Customization**
- **Prompt Templates**: Tweak your AI prompts to reflect your domain, style, or preference.
- **Tagging**: Decide how documents get tagged—manually, automatically, or via OCR-based flows.
4. **Simple Docker Deployment**
A few environment variables, and you're off! Compose it alongside paperless-ngx with minimal fuss.
5. **Unified Web UI**
- **Manual Review**: Approve or tweak the AI's suggestions.
- **Auto Processing**: Focus only on edge cases while the rest is sorted for you.
6. **Opt-In LLM-based OCR**
If you opt in, your images get read by a Vision LLM, pushing boundaries beyond standard OCR tools.
---
## Table of Contents
- [Key Highlights](#key-highlights)
- [Getting Started](#getting-started)
- [Prerequisites](#prerequisites)
- [Installation](#installation)
- [Docker Compose](#docker-compose)
- [Manual Setup](#manual-setup)
- [Configuration](#configuration)
- [Environment Variables](#environment-variables)
- [Custom Prompt Templates](#custom-prompt-templates)
- [Prompt Templates Directory](#prompt-templates-directory)
- [Mounting the Prompts Directory](#mounting-the-prompts-directory)
- [Editing the Prompt Templates](#editing-the-prompt-templates)
- [Template Syntax and Variables](#template-syntax-and-variables)
- [OCR using AI](#llm-based-ocr-compare-for-yourself)
- [Usage](#usage)
- [Contributing](#contributing)
- [License](#license)
- [Star History](#star-history)
- [Disclaimer](#disclaimer)
- [paperless-gpt](#paperless-gpt)
- [Features](#features)
- [Table of Contents](#table-of-contents)
- [Getting Started](#getting-started)
- [Prerequisites](#prerequisites)
- [Installation](#installation)
- [Docker Compose](#docker-compose)
- [Manual Setup](#manual-setup)
- [Configuration](#configuration)
- [Environment Variables](#environment-variables)
- [Custom Prompt Templates](#custom-prompt-templates)
- [Prompt Templates Directory](#prompt-templates-directory)
- [Mounting the Prompts Directory](#mounting-the-prompts-directory)
- [Editing the Prompt Templates](#editing-the-prompt-templates)
- [Template Syntax and Variables](#template-syntax-and-variables)
- [Usage](#usage)
- [Contributing](#contributing)
- [License](#license)
- [Star History](#star-history)
---
## Getting Started
### Prerequisites
- [Docker](https://www.docker.com/get-started) installed on your system.
- A running instance of [paperless-ngx](https://github.com/paperless-ngx/paperless-ngx).
- [Docker][docker-install] installed.
- A running instance of [paperless-ngx][paperless-ngx].
- Access to an LLM provider:
- **OpenAI**: An API key with access to models like `gpt-4o` or `gpt-3.5-turbo`.
- **Ollama**: A running Ollama server with models like `llama2` installed.
- **OpenAI**: An API key with models like `gpt-4o` or `gpt-3.5-turbo`.
- **Ollama**: A running Ollama server with models like `llama2`.
### Installation
#### Docker Compose
The easiest way to get started is by using Docker Compose. Below is an example `docker-compose.yml` file to set up paperless-gpt alongside paperless-ngx.
Here's an example `docker-compose.yml` to spin up **paperless-gpt** alongside paperless-ngx:
```yaml
version: "3.7"
services:
paperless-ngx:
image: ghcr.io/paperless-ngx/paperless-ngx:latest
# ... (your existing paperless-ngx configuration)
# ... (your existing paperless-ngx config)
paperless-gpt:
image: icereed/paperless-gpt:latest
environment:
PAPERLESS_BASE_URL: "http://paperless-ngx:8000"
PAPERLESS_API_TOKEN: "your_paperless_api_token"
PAPERLESS_PUBLIC_URL: "http://paperless.mydomain.com" # Optional, your public link to access Paperless
MANUAL_TAG: "paperless-gpt" # Optional, default is 'paperless-gpt'
AUTO_TAG: "paperless-gpt-auto" # Optional, default is 'paperless-gpt-auto'
LLM_PROVIDER: "openai" # or 'ollama'
LLM_MODEL: "gpt-4o" # or 'llama2'
OPENAI_API_KEY: "your_openai_api_key" # Required if using OpenAI
LLM_LANGUAGE: "English" # Optional, default is 'English'
OLLAMA_HOST: "http://host.docker.internal:11434" # If using Ollama
VISION_LLM_PROVIDER: "ollama" # Optional (for OCR) - ollama or openai
VISION_LLM_MODEL: "minicpm-v" # Optional (for OCR) - minicpm-v, for example for ollama, gpt-4o for openai
AUTO_OCR_TAG: "paperless-gpt-ocr-auto" # Optional, default is 'paperless-gpt-ocr-auto'
LOG_LEVEL: "info" # Optional or 'debug', 'warn', 'error'
PAPERLESS_BASE_URL: 'http://paperless-ngx:8000'
PAPERLESS_API_TOKEN: 'your_paperless_api_token'
PAPERLESS_PUBLIC_URL: 'http://paperless.mydomain.com' # Optional
MANUAL_TAG: 'paperless-gpt' # Optional, default: paperless-gpt
AUTO_TAG: 'paperless-gpt-auto' # Optional, default: paperless-gpt-auto
LLM_PROVIDER: 'openai' # or 'ollama'
LLM_MODEL: 'gpt-4o' # or 'llama2'
OPENAI_API_KEY: 'your_openai_api_key'
# Optional - OPENAI_BASE_URL: 'https://litellm.yourinstallationof.it.com/v1'
LLM_LANGUAGE: 'English' # Optional, default: English
OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama
VISION_LLM_PROVIDER: 'ollama' # (for OCR) - openai or ollama
VISION_LLM_MODEL: 'minicpm-v' # (for OCR) - minicpm-v (ollama example), gpt-4o (for openai), etc.
AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default: paperless-gpt-ocr-auto
OCR_LIMIT_PAGES: '5' # Optional, default: 5. Set to 0 for no limit.
LOG_LEVEL: 'info' # Optional: debug, warn, error
volumes:
- ./prompts:/app/prompts # Mount the prompts directory
- ./prompts:/app/prompts # Mount the prompts directory
ports:
- "8080:8080"
depends_on:
- paperless-ngx
```
**Note:** Replace the placeholder values with your actual configuration.
**Pro Tip**: Replace placeholders with real values and read the logs if something looks off.
#### Manual Setup
If you prefer to run the application manually:
1. **Clone the Repository:**
1. **Clone the Repository**
```bash
git clone https://github.com/icereed/paperless-gpt.git
cd paperless-gpt
```
2. **Create a `prompts` Directory:**
2. **Create a `prompts` Directory**
```bash
mkdir prompts
```
3. **Build the Docker Image:**
3. **Build the Docker Image**
```bash
docker build -t paperless-gpt .
```
4. **Run the Container:**
4. **Run the Container**
```bash
docker run -d \
-e PAPERLESS_BASE_URL='http://your_paperless_ngx_url' \
@ -128,201 +135,313 @@ If you prefer to run the application manually:
-e VISION_LLM_PROVIDER='ollama' \
-e VISION_LLM_MODEL='minicpm-v' \
-e LOG_LEVEL='info' \
-v $(pwd)/prompts:/app/prompts \ # Mount the prompts directory
-v $(pwd)/prompts:/app/prompts \
-p 8080:8080 \
paperless-gpt
```
---
## Configuration
### Environment Variables
| Variable | Description | Required |
| -------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | -------- |
| `PAPERLESS_BASE_URL` | The base URL of your paperless-ngx instance (e.g., `http://paperless-ngx:8000`). | Yes |
| `PAPERLESS_API_TOKEN` | API token for accessing paperless-ngx. You can generate one in the paperless-ngx admin interface. | Yes |
| `PAPERLESS_PUBLIC_URL` | The public URL for your Paperless instance, if it is different to your `PAPERLESS_BASE_URL` - say if you are running in Docker Compose | No |
| `MANUAL_TAG` | The tag to use for manually processing documents. Default is `paperless-gpt`. | No |
| `AUTO_TAG` | The tag to use for automatically processing documents. Default is `paperless-gpt-auto`. | No |
| `LLM_PROVIDER` | The LLM provider to use (`openai` or `ollama`). | Yes |
| `LLM_MODEL` | The model name to use (e.g., `gpt-4o`, `gpt-3.5-turbo`, `llama2`). | Yes |
| `OPENAI_API_KEY` | Your OpenAI API key. Required if using OpenAI as the LLM provider. | Cond. |
| `LLM_LANGUAGE` | The likely language of your documents (e.g., `English`, `German`). Default is `English`. | No |
| `OLLAMA_HOST` | The URL of the Ollama server (e.g., `http://host.docker.internal:11434`). Useful if using Ollama. Default is `http://127.0.0.1:11434`. | No |
| `VISION_LLM_PROVIDER` | The vision LLM provider to use for OCR (`openai` or `ollama`). | No |
| `VISION_LLM_MODEL` | The model name to use for OCR (e.g., `minicpm-v`). | No |
| `AUTO_OCR_TAG` | The tag to use for automatically processing documents with OCR. Default is `paperless-gpt-ocr-auto`. | No |
| `LOG_LEVEL` | The log level for the application (`info`, `debug`, `warn`, `error`). Default is `info`. | No |
| `LISTEN_INTERFACE` | The interface paperless-gpt listens to. Default is `:8080` | No |
| `WEBUI_PATH` | The path to load static content from. Default is `./web-app/dist` | No |
| `AUTO_GENERATE_TITLE` | Enable/disable title generation when automatically applying suggestions with `paperless-gpt-auto`. Default is `true` | No |
| `AUTO_GENERATE_TAGS` | Enable/disable tag generation when automatically applying suggestions with `paperless-gpt-auto`. Default is `true` | No |
| `CORRESPONDENT_BLACK_LIST` | A comma-separated list of names to exclude from the correspondents suggestions. Example: `John Doe, Jane Smith`. | No |
**Note:** When using Ollama, ensure that the Ollama server is running and accessible from the paperless-gpt container.
| Variable | Description | Required |
|------------------------|------------------------------------------------------------------------------------------------------------------|----------|
| `PAPERLESS_BASE_URL` | URL of your paperless-ngx instance (e.g. `http://paperless-ngx:8000`). | Yes |
| `PAPERLESS_API_TOKEN` | API token for paperless-ngx. Generate one in paperless-ngx admin. | Yes |
| `PAPERLESS_PUBLIC_URL` | Public URL for Paperless (if different from `PAPERLESS_BASE_URL`). | No |
| `MANUAL_TAG` | Tag for manual processing. Default: `paperless-gpt`. | No |
| `AUTO_TAG` | Tag for auto processing. Default: `paperless-gpt-auto`. | No |
| `LLM_PROVIDER` | AI backend (`openai` or `ollama`). | Yes |
| `LLM_MODEL` | AI model name, e.g. `gpt-4o`, `gpt-3.5-turbo`, `llama2`. | Yes |
| `OPENAI_API_KEY` | OpenAI API key (required if using OpenAI). | Cond. |
| `OPENAI_BASE_URL` | OpenAI base URL (optional, if using a custom OpenAI compatible service like LiteLLM). | No |
| `LLM_LANGUAGE` | Likely language for documents (e.g. `English`). Default: `English`. | No |
| `OLLAMA_HOST` | Ollama server URL (e.g. `http://host.docker.internal:11434`). | No |
| `VISION_LLM_PROVIDER` | AI backend for OCR (`openai` or `ollama`). | No |
| `VISION_LLM_MODEL` | Model name for OCR (e.g. `minicpm-v`). | No |
| `AUTO_OCR_TAG` | Tag for automatically processing docs with OCR. Default: `paperless-gpt-ocr-auto`. | No |
| `LOG_LEVEL` | Application log level (`info`, `debug`, `warn`, `error`). Default: `info`. | No |
| `LISTEN_INTERFACE` | Network interface to listen on. Default: `:8080`. | No |
| `WEBUI_PATH` | Path for static content. Default: `./web-app/dist`. | No |
| `AUTO_GENERATE_TITLE` | Generate titles automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
| `AUTO_GENERATE_TAGS` | Generate tags automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
| `AUTO_GENERATE_CORRESPONDENTS` | Generate correspondents automatically if `paperless-gpt-auto` is used. Default: `true`. | No |
| `OCR_LIMIT_PAGES` | Limit the number of pages for OCR. Set to `0` for no limit. Default: `5`. | No |
| `CORRESPONDENT_BLACK_LIST` | A comma-separated list of names to exclude from the correspondent suggestions. Example: `John Doe, Jane Smith`. | No |
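To make the defaults above concrete, here is a minimal Go sketch of how these variables can be interpreted. The handling of `OCR_LIMIT_PAGES` and the `AUTO_GENERATE_*` flags follows the `main.go` changes in this commit (unset means `5`, and only the literal string `false` disables a flag); the helper names and the whitespace trimming are illustrative assumptions, not part of the application's API.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// ocrLimitPages returns OCR_LIMIT_PAGES as an int, defaulting to 5 when unset.
// A value of 0 means "no limit", matching the table above.
func ocrLimitPages() (int, error) {
	raw := os.Getenv("OCR_LIMIT_PAGES")
	if raw == "" {
		return 5, nil // documented default
	}
	return strconv.Atoi(raw)
}

// autoGenerate reports whether an AUTO_GENERATE_* flag is enabled;
// anything other than the literal string "false" counts as true.
func autoGenerate(name string) bool {
	return strings.ToLower(os.Getenv(name)) != "false"
}

// correspondentBlackList splits CORRESPONDENT_BLACK_LIST on commas.
func correspondentBlackList() []string {
	raw := os.Getenv("CORRESPONDENT_BLACK_LIST")
	if raw == "" {
		return nil
	}
	parts := strings.Split(raw, ",")
	for i, p := range parts {
		parts[i] = strings.TrimSpace(p) // assumption: surrounding spaces are ignored
	}
	return parts
}

func main() {
	limit, err := ocrLimitPages()
	if err != nil {
		fmt.Println("invalid OCR_LIMIT_PAGES:", err)
		return
	}
	fmt.Println("OCR page limit:", limit)
	fmt.Println("generate tags:", autoGenerate("AUTO_GENERATE_TAGS"))
	fmt.Println("correspondent blacklist:", correspondentBlackList())
}
```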
### Custom Prompt Templates
You can customize the prompt templates used by paperless-gpt to generate titles and tags. By default, the application uses built-in templates, but you can modify them by editing the template files.
paperless-gpt's flexible **prompt templates** let you shape how AI responds:
#### Prompt Templates Directory
1. **`title_prompt.tmpl`**: For document titles.
2. **`tag_prompt.tmpl`**: For tagging logic.
3. **`ocr_prompt.tmpl`**: For LLM OCR.
The prompt templates are stored in the `prompts` directory inside the application. The two main template files are:
- `title_prompt.tmpl`: Template used for generating document titles.
- `tag_prompt.tmpl`: Template used for generating document tags.
#### Mounting the Prompts Directory
To modify the prompt templates, you need to mount a local `prompts` directory into the container.
**Docker Compose Example:**
Mount them into your container via:
```yaml
services:
paperless-gpt:
image: icereed/paperless-gpt:latest
# ... (other configurations)
volumes:
- ./prompts:/app/prompts # Mount the prompts directory
volumes:
- ./prompts:/app/prompts
```
**Docker Run Command Example:**
Then tweak at will—**paperless-gpt** reloads them automatically on startup!
```bash
docker run -d \
# ... (other configurations)
-v $(pwd)/prompts:/app/prompts \
paperless-gpt
```
#### Editing the Prompt Templates
1. **Start the Container:**
When you first start the container with the `prompts` directory mounted, it will automatically create the default template files in your local `prompts` directory if they do not exist.
2. **Edit the Template Files:**
- Open `prompts/title_prompt.tmpl` and `prompts/tag_prompt.tmpl` with your favorite text editor.
- Modify the templates using Go's `text/template` syntax.
- Save the changes.
3. **Restart the Container (if necessary):**
The application automatically reloads the templates when it starts. If the container is already running, you may need to restart it to apply the changes.
#### Template Syntax and Variables
The templates use Go's `text/template` syntax and have access to the following variables:
- **For `title_prompt.tmpl`:**
- `{{.Language}}`: The language specified in `LLM_LANGUAGE` (default is `English`).
- `{{.Content}}`: The content of the document.
- **For `tag_prompt.tmpl`:**
- `{{.Language}}`: The language specified in `LLM_LANGUAGE`.
- `{{.AvailableTags}}`: A list (array) of available tags from paperless-ngx.
- `{{.Title}}`: The suggested title for the document.
- `{{.Content}}`: The content of the document.
**Example `title_prompt.tmpl`:**
```text
I will provide you with the content of a document that has been partially read by OCR (so it may contain errors).
Your task is to find a suitable document title that I can use as the title in the paperless-ngx program.
Respond only with the title, without any additional information. The content is likely in {{.Language}}.
Be sure to add one fitting emoji at the beginning of the title to make it more visually appealing.
Content:
{{.Content}}
```
**Example `tag_prompt.tmpl`:**
```text
I will provide you with the content and the title of a document. Your task is to select appropriate tags for the document from the list of available tags I will provide. Only select tags from the provided list. Respond only with the selected tags as a comma-separated list, without any additional information. The content is likely in {{.Language}}.
Available Tags:
{{.AvailableTags | join ","}}
Title:
{{.Title}}
Content:
{{.Content}}
Please concisely select the {{.Language}} tags from the list above that best describe the document.
Be very selective and only choose the most relevant tags since too many tags will make the document less discoverable.
```
**Note:** Advanced users can utilize additional functions from the [Sprig](http://masterminds.github.io/sprig/) template library, as it is included in the application.
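For example, the Sprig `trunc` helper can cap how much document content is sent to the model. Below is a minimal sketch of rendering such a template with Go's `text/template` and Sprig functions enabled; the loader shown here is illustrative rather than paperless-gpt's internal code, but the `.Language`/`.Content` variables match the list above.

```go
package main

import (
	"os"
	"text/template"

	"github.com/Masterminds/sprig/v3"
)

func main() {
	// A shortened title prompt that uses the Sprig "trunc" helper on the content.
	const titlePrompt = `Suggest a title in {{.Language}} for the following document.
Respond with the title only.

Content (truncated):
{{ .Content | trunc 500 }}
`

	tmpl := template.Must(
		template.New("title_prompt.tmpl").
			Funcs(sprig.TxtFuncMap()). // exposes Sprig helpers like trunc, upper, join
			Parse(titlePrompt),
	)

	data := map[string]interface{}{
		"Language": "English",
		"Content":  "Invoice from ACME Corp dated 2024-05-01 ...", // sample OCR text
	}
	if err := tmpl.Execute(os.Stdout, data); err != nil {
		panic(err)
	}
}
```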
---
## Usage
1. **Tag Documents in paperless-ngx:**
1. **Tag Documents**
- Add `paperless-gpt` or your custom tag to the docs you want to AI-ify.
- Add the tag `paperless-gpt` to documents you want to process. This tag is configurable via the `tagToFilter` variable in the code (default is `paperless-gpt`).
2. **Visit Web UI**
- Go to `http://localhost:8080` (or your host) in your browser.
2. **Access the paperless-gpt Interface:**
3. **Generate & Apply Suggestions**
- Click “Generate Suggestions” to see AI-proposed titles/tags.
- Approve, edit, or discard. Hit “Apply” to finalize in paperless-ngx.
- Open your browser and navigate to `http://localhost:8080`.
4. **Try LLM-Based OCR (Experimental)**
- If you enabled `VISION_LLM_PROVIDER` and `VISION_LLM_MODEL`, let AI-based OCR read your scanned PDFs.
- Tag those documents with `paperless-gpt-ocr-auto` (or your custom `AUTO_OCR_TAG`).
3. **Process Documents:**
**Tip**: The entire pipeline can be **fully automated** if you prefer minimal manual intervention.
- Click on **"Generate Suggestions"** to let the LLM generate title suggestions based on the document content.
---
4. **Review and Apply Titles and Tags:**
## LLM-Based OCR: Compare for Yourself
- Review the suggested titles. You can edit them if necessary.
- Click on **"Apply Suggestions"** to update the document titles in paperless-ngx.
<details>
<summary>Click to expand the vanilla OCR vs. AI-powered OCR comparison</summary>
5. **Experimental OCR Feature:**
### Example 1
- Send documents to a vision LLM for OCR processing.
- Example configuration to enable OCR with Ollama:
```env
VISION_LLM_PROVIDER=ollama
VISION_LLM_MODEL=minicpm-v
```
**Image**:
![Image](demo/ocr-example1.jpg)
**Vanilla Paperless-ngx OCR**:
```
La Grande Recre
Gentre Gommercial 1'Esplanade
1349 LOLNAIN LA NEWWE
TA BERBOGAAL Tel =. 010 45,96 12
Ticket 1440112 03/11/2006 a 13597:
4007176614518. DINOS. TYRAMNESA
TOTAET.T.LES
ReslE par Lask-Euron
Rencu en Cash Euro
V.14.6 -Hotgese = VALERTE
TICKET A-GONGERVER PORR TONT. EEHANGE
HERET ET A BIENTOT
```
**LLM-Powered OCR (OpenAI gpt-4o)**:
```
La Grande Récré
Centre Commercial l'Esplanade
1348 LOUVAIN LA NEUVE
TVA 860826401 Tel : 010 45 95 12
Ticket 14421 le 03/11/2006 à 15:27:18
4007176614518 DINOS TYRANNOSA 14.90
TOTAL T.T.C. 14.90
Réglé par Cash Euro 50.00
Rendu en Cash Euro 35.10
V.14.6 Hôtesse : VALERIE
TICKET A CONSERVER POUR TOUT ECHANGE
MERCI ET A BIENTOT
```
---
### Example 2
**Image**:
![Image](demo/ocr-example2.jpg)
**Vanilla Paperless-ngx OCR**:
```
Invoice Number: 1-996-84199
Fed: Invoica Date: Sep01, 2014
Accaunt Number: 1334-8037-4
Page: 1012
Fod£x Tax ID 71.0427007
IRISINC
SHARON ANDERSON
4731 W ATLANTIC AVE STE BI
DELRAY BEACH FL 33445-3897 a
Invoice Questions?
Bing, Account Shipping Address: Contact FedEx Reı
ISINC
4731 W ATLANTIC AVE Phone: (800) 622-1147 M-F 7-6 (CST)
DELRAY BEACH FL 33445-3897 US Fax: (800) 548-3020
Internet: www.fedex.com
Invoice Summary Sep 01, 2014
FodEx Ground Services
Other Charges 11.00
Total Charges 11.00 Da £
>
polo) Fz// /G
TOTAL THIS INVOICE .... usps 11.00 P 2/1 f
The only charges accrued for this period is the Weekly Service Charge.
The Fedix Ground aceounts teferencedin his involce have been transteired and assigned 10, are owned by,andare payable to FedEx Express:
To onsurs propor credit, plasa raturn this portion wirh your payment 10 FodEx
Please do not staple or fold. Ploase make your chack payablı to FedEx.
[TI For change ol address, hc har and camphat lrm or never ide
Remittance Advice
Your payment is due by Sep 16, 2004
Number Number Dus
1334803719968 41993200000110071
AT 01 0391292 468448196 A**aDGT
IRISINC Illallun elalalssollallansdHilalellund
SHARON ANDERSON
4731 W ATLANTIC AVE STEBI FedEx
DELRAY BEACH FL 334453897 PO. Box 94516
PALATINE IL 60094-4515
```
**LLM-Powered OCR (OpenAI gpt-4o)**:
```
FedEx. Invoice Number: 1-996-84199
Invoice Date: Sep 01, 2014
Account Number: 1334-8037-4
Page: 1 of 2
FedEx Tax ID: 71-0427007
I R I S INC
SHARON ANDERSON
4731 W ATLANTIC AVE STE B1
DELRAY BEACH FL 33445-3897
Invoice Questions?
Billing Account Shipping Address: Contact FedEx Revenue Services
I R I S INC Phone: (800) 622-1147 M-F 7-6 (CST)
4731 W ATLANTIC AVE Fax: (800) 548-3020
DELRAY BEACH FL 33445-3897 US Internet: www.fedex.com
Invoice Summary Sep 01, 2014
FedEx Ground Services
Other Charges 11.00
Total Charges .......................................................... USD $ 11.00
TOTAL THIS INVOICE .............................................. USD $ 11.00
The only charges accrued for this period is the Weekly Service Charge.
RECEIVED
SEP _ 8 REC'D
BY: _
posted 9/21/14
The FedEx Ground accounts referenced in this invoice have been transferred and assigned to, are owned by, and are payable to FedEx Express.
To ensure proper credit, please return this portion with your payment to FedEx.
Please do not staple or fold. Please make your check payable to FedEx.
❑ For change of address, check here and complete form on reverse side.
Remittance Advice
Your payment is due by Sep 16, 2004
Invoice
Number
1-996-84199
Account
Number
1334-8037-4
Amount
Due
USD $ 11.00
133480371996841993200000110071
AT 01 031292 468448196 A**3DGT
I R I S INC
SHARON ANDERSON
4731 W ATLANTIC AVE STE B1
DELRAY BEACH FL 33445-3897
FedEx
P.O. Box 94515
```
---
</details>
**Why Does It Matter?**
- Traditional OCR often jumbles text from complex or low-quality scans.
- Large Language Models interpret context and correct likely errors, producing results that are more precise and readable.
- You can integrate these cleaned-up texts into your **paperless-ngx** pipeline for better tagging, searching, and archiving.
### How It Works
- **Vanilla OCR** typically uses classical methods or Tesseract-like engines to extract text, which can result in garbled outputs for complex fonts or poor-quality scans.
- **LLM-Powered OCR** uses your chosen AI backend—OpenAI or Ollama—to interpret the image's text in a more context-aware manner. This leads to fewer errors and more coherent text (see the sketch below).
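Concretely, each page image is sent to the vision model as one multimodal message: an image part plus a short OCR instruction. The sketch below uses the same langchaingo `llms` content-part calls that paperless-gpt relies on; the provider setup (`ollama.New` with a model name and server URL), the file path, and the prompt text are placeholder assumptions for illustration.

```go
package main

import (
	"context"
	"fmt"
	"os"

	"github.com/tmc/langchaingo/llms"
	"github.com/tmc/langchaingo/llms/ollama"
)

func main() {
	ctx := context.Background()

	// Hypothetical setup matching VISION_LLM_PROVIDER=ollama, VISION_LLM_MODEL=minicpm-v.
	model, err := ollama.New(
		ollama.WithModel("minicpm-v"),
		ollama.WithServerURL("http://host.docker.internal:11434"),
	)
	if err != nil {
		panic(err)
	}

	// A single scanned page as JPEG bytes (path is illustrative).
	jpegBytes, err := os.ReadFile("page000.jpg")
	if err != nil {
		panic(err)
	}

	// One human message carrying the image and the OCR instruction,
	// mirroring the binary-part approach used for non-OpenAI providers.
	msg := llms.MessageContent{
		Role: llms.ChatMessageTypeHuman,
		Parts: []llms.ContentPart{
			llms.BinaryPart("image/jpeg", jpegBytes),
			llms.TextPart("Transcribe the text in this image exactly as written."),
		},
	}

	resp, err := model.GenerateContent(ctx, []llms.MessageContent{msg})
	if err != nil {
		panic(err)
	}
	fmt.Println(resp.Choices[0].Content)
}
```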
---
## Contributing
Contributions are welcome! Please read the [contributing guidelines](CONTRIBUTING.md) before submitting a pull request.
**Pull requests** and **issues** are welcome!
1. Fork the repo
2. Create a branch (`feature/my-awesome-update`)
3. Commit changes (`git commit -m "Improve X"`)
4. Open a PR
1. **Fork the Repository**
Check out our [contributing guidelines](CONTRIBUTING.md) for details.
2. **Create a Feature Branch**
```bash
git checkout -b feature/my-new-feature
```
3. **Commit Your Changes**
```bash
git commit -am 'Add some feature'
```
4. **Push to the Branch**
```bash
git push origin feature/my-new-feature
```
5. **Create a Pull Request**
---
## License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
paperless-gpt is licensed under the [MIT License](LICENSE). Feel free to adapt and share!
---
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=icereed/paperless-gpt&type=Date)](https://star-history.com/#icereed/paperless-gpt&Date)
---
**Disclaimer:** This project is not affiliated with the official paperless-ngx project. Use at your own discretion.
## Disclaimer
This project is **not** officially affiliated with [paperless-ngx][paperless-ngx]. Use at your own risk.
---
**paperless-gpt**: The **LLM-based** companion your doc management has been waiting for. Enjoy effortless, intelligent document titles, tags, and next-level OCR.
[paperless-ngx]: https://github.com/paperless-ngx/paperless-ngx
[docker-install]: https://docs.docker.com/get-docker/

View file

@ -119,7 +119,7 @@ func (app *App) generateSuggestionsHandler(c *gin.Context) {
return
}
results, err := app.generateDocumentSuggestions(ctx, suggestionRequest)
results, err := app.generateDocumentSuggestions(ctx, suggestionRequest, log.WithContext(ctx))
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Sprintf("Error processing documents: %v", err)})
log.Errorf("Error processing documents: %v", err)

View file

@ -5,9 +5,13 @@ import (
"context"
"encoding/base64"
"fmt"
"image"
"strings"
"sync"
_ "image/jpeg"
"github.com/sirupsen/logrus"
"github.com/tmc/langchaingo/llms"
)
@ -52,7 +56,12 @@ func (app *App) getSuggestedCorrespondent(ctx context.Context, content string, s
}
// getSuggestedTags generates suggested tags for a document using the LLM
func (app *App) getSuggestedTags(ctx context.Context, content string, suggestedTitle string, availableTags []string) ([]string, error) {
func (app *App) getSuggestedTags(
ctx context.Context,
content string,
suggestedTitle string,
availableTags []string,
logger *logrus.Entry) ([]string, error) {
likelyLanguage := getLikelyLanguage()
templateMutex.RLock()
@ -66,11 +75,12 @@ func (app *App) getSuggestedTags(ctx context.Context, content string, suggestedT
"Content": content,
})
if err != nil {
logger.Errorf("Error executing tag template: %v", err)
return nil, fmt.Errorf("error executing tag template: %v", err)
}
prompt := promptBuffer.String()
log.Debugf("Tag suggestion prompt: %s", prompt)
logger.Debugf("Tag suggestion prompt: %s", prompt)
completion, err := app.LLM.GenerateContent(ctx, []llms.MessageContent{
{
@ -83,6 +93,7 @@ func (app *App) getSuggestedTags(ctx context.Context, content string, suggestedT
},
})
if err != nil {
logger.Errorf("Error getting response from LLM: %v", err)
return nil, fmt.Errorf("error getting response from LLM: %v", err)
}
@ -106,7 +117,7 @@ func (app *App) getSuggestedTags(ctx context.Context, content string, suggestedT
return filteredTags, nil
}
func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, error) {
func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte, logger *logrus.Entry) (string, error) {
templateMutex.RLock()
defer templateMutex.RUnlock()
@ -122,15 +133,27 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
prompt := promptBuffer.String()
// Log the image dimensions
img, _, err := image.Decode(bytes.NewReader(jpegBytes))
if err != nil {
return "", fmt.Errorf("error decoding image: %v", err)
}
bounds := img.Bounds()
logger.Debugf("Image dimensions: %dx%d", bounds.Dx(), bounds.Dy())
// If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision
var parts []llms.ContentPart
if strings.ToLower(visionLlmProvider) != "openai" {
// Log image size in kilobytes
logger.Debugf("Image size: %d KB", len(jpegBytes)/1024)
parts = []llms.ContentPart{
llms.BinaryPart("image/jpeg", jpegBytes),
llms.TextPart(prompt),
}
} else {
base64Image := base64.StdEncoding.EncodeToString(jpegBytes)
// Log image size in kilobytes
logger.Debugf("Image size: %d KB", len(base64Image)/1024)
parts = []llms.ContentPart{
llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)),
llms.TextPart(prompt),
@ -154,7 +177,7 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro
}
// getSuggestedTitle generates a suggested title for a document using the LLM
func (app *App) getSuggestedTitle(ctx context.Context, content string) (string, error) {
func (app *App) getSuggestedTitle(ctx context.Context, content string, logger *logrus.Entry) (string, error) {
likelyLanguage := getLikelyLanguage()
templateMutex.RLock()
@ -171,7 +194,7 @@ func (app *App) getSuggestedTitle(ctx context.Context, content string) (string,
prompt := promptBuffer.String()
log.Debugf("Title suggestion prompt: %s", prompt)
logger.Debugf("Title suggestion prompt: %s", prompt)
completion, err := app.LLM.GenerateContent(ctx, []llms.MessageContent{
{
@ -191,7 +214,7 @@ func (app *App) getSuggestedTitle(ctx context.Context, content string) (string,
}
// generateDocumentSuggestions generates suggestions for a set of documents
func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionRequest GenerateSuggestionsRequest) ([]DocumentSuggestion, error) {
func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionRequest GenerateSuggestionsRequest, logger *logrus.Entry) ([]DocumentSuggestion, error) {
// Fetch all available tags from paperless-ngx
availableTagsMap, err := app.Client.GetAllTags(ctx)
if err != nil {
@ -231,7 +254,8 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque
go func(doc Document) {
defer wg.Done()
documentID := doc.ID
log.Printf("Processing Document ID %d...", documentID)
docLogger := documentLogger(documentID)
docLogger.Printf("Processing Document ID %d...", documentID)
content := doc.Content
if len(content) > 5000 {
@ -243,23 +267,23 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque
var suggestedCorrespondent string
if suggestionRequest.GenerateTitles {
suggestedTitle, err = app.getSuggestedTitle(ctx, content)
suggestedTitle, err = app.getSuggestedTitle(ctx, content, docLogger)
if err != nil {
mu.Lock()
errorsList = append(errorsList, fmt.Errorf("Document %d: %v", documentID, err))
mu.Unlock()
log.Errorf("Error processing document %d: %v", documentID, err)
docLogger.Errorf("Error processing document %d: %v", documentID, err)
return
}
}
if suggestionRequest.GenerateTags {
suggestedTags, err = app.getSuggestedTags(ctx, content, suggestedTitle, availableTagNames)
suggestedTags, err = app.getSuggestedTags(ctx, content, suggestedTitle, availableTagNames, docLogger)
if err != nil {
mu.Lock()
errorsList = append(errorsList, fmt.Errorf("Document %d: %v", documentID, err))
mu.Unlock()
log.Errorf("Error generating tags for document %d: %v", documentID, err)
logger.Errorf("Error generating tags for document %d: %v", documentID, err)
return
}
}
@ -283,7 +307,7 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque
}
// Titles
if suggestionRequest.GenerateTitles {
log.Printf("Suggested title for document %d: %s", documentID, suggestedTitle)
docLogger.Printf("Suggested title for document %d: %s", documentID, suggestedTitle)
suggestion.SuggestedTitle = suggestedTitle
} else {
suggestion.SuggestedTitle = doc.Title
@ -291,10 +315,10 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque
// Tags
if suggestionRequest.GenerateTags {
log.Printf("Suggested tags for document %d: %v", documentID, suggestedTags)
docLogger.Printf("Suggested tags for document %d: %v", documentID, suggestedTags)
suggestion.SuggestedTags = suggestedTags
} else {
suggestion.SuggestedTags = removeTagFromList(doc.Tags, manualTag)
suggestion.SuggestedTags = doc.Tags
}
// Correspondents
@ -304,10 +328,12 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque
} else {
suggestion.SuggestedCorrespondent = ""
}
// Remove manual tag from the list of suggested tags
suggestion.RemoveTags = []string{manualTag, autoTag}
documentSuggestions = append(documentSuggestions, suggestion)
mu.Unlock()
log.Printf("Document %d processed successfully.", documentID)
docLogger.Printf("Document %d processed successfully.", documentID)
}(documents[i])
}

BIN
demo.gif

Binary file not shown.

Before

Size: 1.2 MiB

BIN
demo.mp4 Normal file

Binary file not shown.

BIN
demo/ocr-example1.jpg Normal file

Binary file not shown.

After

Size: 30 KiB

BIN
demo/ocr-example2.jpg Normal file

Binary file not shown.

After

Size: 73 KiB

6
go.mod
View file

@ -2,7 +2,7 @@ module paperless-gpt
go 1.22.0
toolchain go1.22.2
toolchain go1.23.4
require (
github.com/Masterminds/sprig/v3 v3.3.0
@ -28,6 +28,7 @@ require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dlclark/regexp2 v1.10.0 // indirect
github.com/ebitengine/purego v0.8.0 // indirect
github.com/fatih/color v1.18.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-playground/locales v0.14.1 // indirect
@ -42,6 +43,7 @@ require (
github.com/jupiterrider/ffi v0.2.0 // indirect
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-sqlite3 v1.14.24 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect
@ -58,7 +60,7 @@ require (
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.26.0 // indirect
golang.org/x/net v0.25.0 // indirect
golang.org/x/sys v0.23.0 // indirect
golang.org/x/sys v0.25.0 // indirect
golang.org/x/text v0.20.0 // indirect
google.golang.org/protobuf v1.34.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect

8
go.sum
View file

@ -25,6 +25,8 @@ github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq
github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
github.com/ebitengine/purego v0.8.0 h1:JbqvnEzRvPpxhCJzJJ2y0RbiZ8nyjccVUrSM3q+GvvE=
github.com/ebitengine/purego v0.8.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM=
github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU=
github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
github.com/gen2brain/go-fitz v1.24.14 h1:09weRkjVtLYNGo7l0J7DyOwBExbwi8SJ9h8YPhw9WEo=
@ -70,6 +72,9 @@ github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZY
github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M=
github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM=
@ -155,6 +160,7 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@ -162,6 +168,8 @@ golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM=
golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc=

124
main.go
View file

@ -6,12 +6,15 @@ import (
"net/http"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"text/template"
"time"
"github.com/Masterminds/sprig/v3"
"github.com/fatih/color"
"github.com/gin-gonic/gin"
"github.com/sirupsen/logrus"
"github.com/tmc/langchaingo/llms"
@ -27,7 +30,6 @@ var (
log = logrus.New()
// Environment Variables
correspondentBlackList = strings.Split(os.Getenv("CORRESPONDENT_BLACK_LIST"), ",")
paperlessBaseURL = os.Getenv("PAPERLESS_BASE_URL")
paperlessAPIToken = os.Getenv("PAPERLESS_API_TOKEN")
openaiAPIKey = os.Getenv("OPENAI_API_KEY")
@ -45,6 +47,7 @@ var (
autoGenerateTitle = os.Getenv("AUTO_GENERATE_TITLE")
autoGenerateTags = os.Getenv("AUTO_GENERATE_TAGS")
autoGenerateCorrespondents = os.Getenv("AUTO_GENERATE_CORRESPONDENTS")
limitOcrPages int // Will be read from OCR_LIMIT_PAGES
// Templates
titleTemplate *template.Template
@ -121,6 +124,9 @@ func main() {
// Initialize logrus logger
initLogger()
// Print version
printVersion()
// Initialize PaperlessClient
client := NewPaperlessClient(paperlessBaseURL, paperlessAPIToken)
@ -263,6 +269,29 @@ func main() {
}
}
func printVersion() {
cyan := color.New(color.FgCyan).SprintFunc()
yellow := color.New(color.FgYellow).SprintFunc()
banner := `
Paperless GPT
`
fmt.Printf("%s\n", cyan(banner))
fmt.Printf("\n%s %s\n", yellow("Version:"), version)
if commit != "" {
fmt.Printf("%s %s\n", yellow("Commit:"), commit)
}
if buildDate != "" {
fmt.Printf("%s %s\n", yellow("Build Date:"), buildDate)
}
fmt.Printf("%s %s/%s\n", yellow("Platform:"), runtime.GOOS, runtime.GOARCH)
fmt.Printf("%s %s\n", yellow("Go Version:"), runtime.Version())
fmt.Printf("%s %s\n", yellow("Started:"), time.Now().Format(time.RFC1123))
fmt.Println()
}
func initLogger() {
switch logLevel {
case "debug":
@ -338,6 +367,24 @@ func validateOrDefaultEnvVars() {
if (llmProvider == "openai" || visionLlmProvider == "openai") && openaiAPIKey == "" {
log.Fatal("Please set the OPENAI_API_KEY environment variable for OpenAI provider.")
}
if isOcrEnabled() {
rawLimitOcrPages := os.Getenv("OCR_LIMIT_PAGES")
if rawLimitOcrPages == "" {
limitOcrPages = 5
} else {
var err error
limitOcrPages, err = strconv.Atoi(rawLimitOcrPages)
if err != nil {
log.Fatalf("Invalid OCR_LIMIT_PAGES value: %v", err)
}
}
}
}
// documentLogger creates a logger with document context
func documentLogger(documentID int) *logrus.Entry {
return log.WithField("document_id", documentID)
}
// processAutoTagDocuments handles the background auto-tagging of documents
@ -356,23 +403,29 @@ func (app *App) processAutoTagDocuments() (int, error) {
log.Debugf("Found at least %d remaining documents with tag %s", len(documents), autoTag)
suggestionRequest := GenerateSuggestionsRequest{
Documents: documents,
GenerateTitles: strings.ToLower(autoGenerateTitle) != "false",
GenerateTags: strings.ToLower(autoGenerateTags) != "false",
GenerateCorrespondents: strings.ToLower(autoGenerateCorrespondents) != "false",
}
for _, document := range documents {
docLogger := documentLogger(document.ID)
docLogger.Info("Processing document for auto-tagging")
suggestions, err := app.generateDocumentSuggestions(ctx, suggestionRequest)
if err != nil {
return 0, fmt.Errorf("error generating suggestions: %w", err)
}
suggestionRequest := GenerateSuggestionsRequest{
Documents: []Document{document},
GenerateTitles: strings.ToLower(autoGenerateTitle) != "false",
GenerateTags: strings.ToLower(autoGenerateTags) != "false",
GenerateCorrespondents: strings.ToLower(autoGenerateCorrespondents) != "false",
}
err = app.Client.UpdateDocuments(ctx, suggestions, app.Database, false)
if err != nil {
return 0, fmt.Errorf("error updating documents: %w", err)
}
suggestions, err := app.generateDocumentSuggestions(ctx, suggestionRequest, docLogger)
if err != nil {
return 0, fmt.Errorf("error generating suggestions for document %d: %w", document.ID, err)
}
err = app.Client.UpdateDocuments(ctx, suggestions, app.Database, false)
if err != nil {
return 0, fmt.Errorf("error updating document %d: %w", document.ID, err)
}
docLogger.Info("Successfully processed document")
}
return len(documents), nil
}
@ -392,26 +445,31 @@ func (app *App) processAutoOcrTagDocuments() (int, error) {
log.Debugf("Found at least %d remaining documents with tag %s", len(documents), autoOcrTag)
documents = documents[:1] // Process only one document at a time
for _, document := range documents {
docLogger := documentLogger(document.ID)
docLogger.Info("Processing document for OCR")
ocrContent, err := app.ProcessDocumentOCR(ctx, documents[0].ID)
if err != nil {
return 0, fmt.Errorf("error processing document OCR: %w", err)
ocrContent, err := app.ProcessDocumentOCR(ctx, document.ID)
if err != nil {
return 0, fmt.Errorf("error processing OCR for document %d: %w", document.ID, err)
}
docLogger.Debug("OCR processing completed")
err = app.Client.UpdateDocuments(ctx, []DocumentSuggestion{
{
ID: document.ID,
OriginalDocument: document,
SuggestedContent: ocrContent,
RemoveTags: []string{autoOcrTag},
},
}, app.Database, false)
if err != nil {
return 0, fmt.Errorf("error updating document %d after OCR: %w", document.ID, err)
}
docLogger.Info("Successfully processed document OCR")
}
log.Debugf("OCR content for document %d: %s", documents[0].ID, ocrContent)
err = app.Client.UpdateDocuments(ctx, []DocumentSuggestion{
{
ID: documents[0].ID,
OriginalDocument: documents[0],
SuggestedContent: ocrContent,
},
}, app.Database, false)
if err != nil {
return 0, fmt.Errorf("error updating documents: %w", err)
}
return 1, nil // Processed one document
return 1, nil
}
// removeTagFromList removes a specific tag from a list of tags

27
ocr.go
View file

@ -9,31 +9,42 @@ import (
// ProcessDocumentOCR processes a document through OCR and returns the combined text
func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) {
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID)
docLogger := documentLogger(documentID)
docLogger.Info("Starting OCR processing")
imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID, limitOcrPages)
defer func() {
for _, imagePath := range imagePaths {
os.Remove(imagePath)
if err := os.Remove(imagePath); err != nil {
docLogger.WithError(err).WithField("image_path", imagePath).Warn("Failed to remove temporary image file")
}
}
}()
if err != nil {
return "", fmt.Errorf("error downloading document images: %w", err)
return "", fmt.Errorf("error downloading document images for document %d: %w", documentID, err)
}
docLogger.WithField("page_count", len(imagePaths)).Debug("Downloaded document images")
var ocrTexts []string
for _, imagePath := range imagePaths {
for i, imagePath := range imagePaths {
pageLogger := docLogger.WithField("page", i+1)
pageLogger.Debug("Processing page")
imageContent, err := os.ReadFile(imagePath)
if err != nil {
return "", fmt.Errorf("error reading image file: %w", err)
return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err)
}
ocrText, err := app.doOCRViaLLM(ctx, imageContent)
ocrText, err := app.doOCRViaLLM(ctx, imageContent, pageLogger)
if err != nil {
return "", fmt.Errorf("error performing OCR: %w", err)
return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err)
}
log.Debugf("OCR text: %s", ocrText)
pageLogger.Debug("OCR completed for page")
ocrTexts = append(ocrTexts, ocrText)
}
docLogger.Info("OCR processing completed successfully")
return strings.Join(ocrTexts, "\n\n"), nil
}

Binary file not shown.

Before

Size: 42 KiB

After

Size: 92 KiB

View file

@ -290,8 +290,9 @@ func (client *PaperlessClient) UpdateDocuments(ctx context.Context, documents []
}
// remove autoTag to prevent infinite loop (even if it is in the original tags)
originalTags = removeTagFromList(originalTags, autoTag)
originalTags = removeTagFromList(originalTags, autoOcrTag)
for _, tag := range document.RemoveTags {
originalTags = removeTagFromList(originalTags, tag)
}
if len(tags) == 0 {
tags = originalTags
@ -300,6 +301,12 @@ func (client *PaperlessClient) UpdateDocuments(ctx context.Context, documents []
originalFields["tags"] = originalTags
// remove autoTag to prevent infinite loop - this is required in case of undo
tags = removeTagFromList(tags, autoTag)
// keep previous tags
tags = append(tags, originalTags...)
// remove duplicates
slices.Sort(tags)
tags = slices.Compact(tags)
}
updatedTagsJSON, err := json.Marshal(tags)
@ -424,7 +431,8 @@ func (client *PaperlessClient) UpdateDocuments(ctx context.Context, documents []
}
// DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images
func (client *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int) ([]string, error) {
// If limitPages > 0, only the first N pages will be processed
func (client *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int, limitPages int) ([]string, error) {
// Create a directory named after the document ID
docDir := filepath.Join(client.GetCacheFolder(), fmt.Sprintf("document-%d", documentId))
if _, err := os.Stat(docDir); os.IsNotExist(err) {
@ -437,6 +445,9 @@ func (client *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, doc
// Check if images already exist
var imagePaths []string
for n := 0; ; n++ {
if limitPages > 0 && n >= limitPages {
break
}
imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
if _, err := os.Stat(imagePath); os.IsNotExist(err) {
break
@ -485,10 +496,15 @@ func (client *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, doc
}
defer doc.Close()
totalPages := doc.NumPage()
if limitPages > 0 && limitPages < totalPages {
totalPages = limitPages
}
var mu sync.Mutex
var g errgroup.Group
for n := 0; n < doc.NumPage(); n++ {
for n := 0; n < totalPages; n++ {
n := n // capture loop variable
g.Go(func() error {
mu.Lock()

View file

@ -300,18 +300,24 @@ func TestUpdateDocuments(t *testing.T) {
OriginalDocument: Document{
ID: 1,
Title: "Old Title",
Tags: []string{"tag1"},
Tags: []string{"tag1", "tag3", "manual", "removeMe"},
},
SuggestedTitle: "New Title",
SuggestedTags: []string{"tag2"},
SuggestedTags: []string{"tag2", "tag3"},
RemoveTags: []string{"removeMe"},
},
}
idTag1 := 1
idTag2 := 2
idTag3 := 4
// Mock data for tags
tagsResponse := map[string]interface{}{
"results": []map[string]interface{}{
{"id": 1, "name": "tag1"},
{"id": 2, "name": "tag2"},
{"id": idTag1, "name": "tag1"},
{"id": idTag2, "name": "tag2"},
{"id": 3, "name": "manual"},
{"id": idTag3, "name": "tag3"},
{"id": 5, "name": "removeMe"},
},
"next": nil,
}
@ -342,7 +348,7 @@ func TestUpdateDocuments(t *testing.T) {
// Expected updated fields
expectedFields := map[string]interface{}{
"title": "New Title",
"tags": []interface{}{float64(2)}, // tag2 ID
"tags": []interface{}{float64(idTag1), float64(idTag2), float64(idTag3)}, // keep also previous tags
}
assert.Equal(t, expectedFields, updatedFields)
@ -385,7 +391,7 @@ func TestDownloadDocumentAsImages(t *testing.T) {
})
ctx := context.Background()
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID)
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 0)
require.NoError(t, err)
// Verify that exactly one page was extracted
@ -422,11 +428,11 @@ func TestDownloadDocumentAsImages_ManyPages(t *testing.T) {
env.client.CacheFolder = "tests/tmp"
// Clean the cache folder
os.RemoveAll(env.client.CacheFolder)
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID)
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 50)
require.NoError(t, err)
// Verify that exactly 52 pages were extracted
assert.Len(t, imagePaths, 52)
// Verify that exactly 50 pages were extracted - the original doc contains 52 pages
assert.Len(t, imagePaths, 50)
// The path shall end with tests/tmp/document-321/page000.jpg
for _, imagePath := range imagePaths {
_, err := os.Stat(imagePath)

View file

@ -2,5 +2,19 @@
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
"extends": [
"config:recommended"
],
"customManagers": [
{
"customType": "regex",
"description": "Update VERSION variables in Dockerfiles",
"fileMatch": [
"^Dockerfile$"
],
"matchStrings": [
"# renovate: datasource=(?<datasource>[a-z-]+?) depName=(?<depName>.+?)(?: versioning=(?<versioning>[a-z-]+?))?\\s(?:ENV|ARG) .+?_VERSION=\"(?<currentValue>.+?)\"\\s",
"# renovate: datasource=(?<datasource>[a-z-]+?) depName=(?<depName>.+?)(?: versioning=(?<versioning>[a-z-]+?))?\\s(?:ENV|ARG) VERSION=\"(?<currentValue>.+?)\"\\s"
],
"versioningTemplate": "{{#if versioning}}{{versioning}}{{else}}semver{{/if}}"
}
]
}

View file

@ -81,6 +81,7 @@ type DocumentSuggestion struct {
SuggestedTags []string `json:"suggested_tags,omitempty"`
SuggestedContent string `json:"suggested_content,omitempty"`
SuggestedCorrespondent string `json:"suggested_correspondent,omitempty"`
RemoveTags []string `json:"remove_tags,omitempty"`
}
type Correspondent struct {

7
version.go Normal file
View file

@ -0,0 +1,7 @@
package main
var (
version = "devVersion"
buildDate = "devBuildDate"
commit = "devCommit"
)

View file

@ -2889,12 +2889,16 @@
}
},
"node_modules/lilconfig": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz",
"integrity": "sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==",
"version": "3.1.3",
"resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz",
"integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">=10"
"node": ">=14"
},
"funding": {
"url": "https://github.com/sponsors/antonk52"
}
},
"node_modules/lines-and-columns": {
@ -3350,18 +3354,6 @@
}
}
},
"node_modules/postcss-load-config/node_modules/lilconfig": {
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.2.tgz",
"integrity": "sha512-eop+wDAvpItUys0FWkHIKeC9ybYrTGbU41U5K7+bttZZeohvnY7M9dZ5kB21GNWiFT2q1OoPTvncPCgSOVO5ow==",
"dev": true,
"engines": {
"node": ">=14"
},
"funding": {
"url": "https://github.com/sponsors/antonk52"
}
},
"node_modules/postcss-nested": {
"version": "6.2.0",
"resolved": "https://registry.npmjs.org/postcss-nested/-/postcss-nested-6.2.0.tgz",
@ -3944,33 +3936,34 @@
"integrity": "sha512-Cat63mxsVJlzYvN51JmVXIgNoUokrIaT2zLclCXjRd8boZ0004U4KCs/sToJ75C6sdlByWxpYnb5Boif1VSFew=="
},
"node_modules/tailwindcss": {
"version": "3.4.12",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.12.tgz",
"integrity": "sha512-Htf/gHj2+soPb9UayUNci/Ja3d8pTmu9ONTfh4QY8r3MATTZOzmv6UYWF7ZwikEIC8okpfqmGqrmDehua8mF8w==",
"version": "3.4.17",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.17.tgz",
"integrity": "sha512-w33E2aCvSDP0tW9RZuNXadXlkHXqFzSkQew/aIa2i/Sj8fThxwovwlXHSPXTbAHwEIhBFXAedUhP2tueAKP8Og==",
"dev": true,
"license": "MIT",
"dependencies": {
"@alloc/quick-lru": "^5.2.0",
"arg": "^5.0.2",
"chokidar": "^3.5.3",
"chokidar": "^3.6.0",
"didyoumean": "^1.2.2",
"dlv": "^1.1.3",
"fast-glob": "^3.3.0",
"fast-glob": "^3.3.2",
"glob-parent": "^6.0.2",
"is-glob": "^4.0.3",
"jiti": "^1.21.0",
"lilconfig": "^2.1.0",
"micromatch": "^4.0.5",
"jiti": "^1.21.6",
"lilconfig": "^3.1.3",
"micromatch": "^4.0.8",
"normalize-path": "^3.0.0",
"object-hash": "^3.0.0",
"picocolors": "^1.0.0",
"postcss": "^8.4.23",
"picocolors": "^1.1.1",
"postcss": "^8.4.47",
"postcss-import": "^15.1.0",
"postcss-js": "^4.0.1",
"postcss-load-config": "^4.0.1",
"postcss-nested": "^6.0.1",
"postcss-selector-parser": "^6.0.11",
"resolve": "^1.22.2",
"sucrase": "^3.32.0"
"postcss-load-config": "^4.0.2",
"postcss-nested": "^6.2.0",
"postcss-selector-parser": "^6.1.2",
"resolve": "^1.22.8",
"sucrase": "^3.35.0"
},
"bin": {
"tailwind": "lib/cli.js",