diff --git a/.github/workflows/docker-build-and-push.yml b/.github/workflows/docker-build-and-push.yml index 361f876..09e65d6 100644 --- a/.github/workflows/docker-build-and-push.yml +++ b/.github/workflows/docker-build-and-push.yml @@ -96,3 +96,7 @@ jobs: cache-from: type=gha cache-to: type=gha,mode=max tags: ${{ env.TAGS }} + build-args: | + VERSION=${{ github.ref_type == 'tag' && github.ref_name || github.sha }} + COMMIT=${{ github.sha }} + BUILD_DATE=${{ github.event.repository.pushed_at }} diff --git a/Dockerfile b/Dockerfile index 92e32c9..42b5697 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,17 +1,33 @@ +# Define top-level build arguments +ARG VERSION=docker-dev +ARG COMMIT=unknown +ARG BUILD_DATE=unknown + # Stage 1: Build the Go binary -FROM golang:1.22-alpine AS builder +FROM golang:1.23.4-alpine3.21 AS builder # Set the working directory inside the container WORKDIR /app -# Install necessary packages -RUN apk add --no-cache \ - git \ - gcc \ - musl-dev \ - mupdf \ - mupdf-dev +# Package versions for Renovate +# renovate: datasource=repology depName=alpine_3_21/gcc versioning=loose +ENV GCC_VERSION=14.2.0-r4 +# renovate: datasource=repology depName=alpine_3_21/musl-dev versioning=loose +ENV MUSL_DEV_VERSION=1.2.5-r8 +# renovate: datasource=repology depName=alpine_3_21/mupdf versioning=loose +ENV MUPDF_VERSION=1.24.10-r0 +# renovate: datasource=repology depName=alpine_3_21/mupdf-dev versioning=loose +ENV MUPDF_DEV_VERSION=1.24.10-r0 +# renovate: datasource=repology depName=alpine_3_21/sed versioning=loose +ENV SED_VERSION=4.9-r2 +# Install necessary packages with pinned versions +RUN apk add --no-cache \ + "gcc=${GCC_VERSION}" \ + "musl-dev=${MUSL_DEV_VERSION}" \ + "mupdf=${MUPDF_VERSION}" \ + "mupdf-dev=${MUPDF_DEV_VERSION}" \ + "sed=${SED_VERSION}" # Copy go.mod and go.sum files COPY go.mod go.sum ./ @@ -24,6 +40,18 @@ RUN CGO_ENABLED=1 go build -tags musl -o /dev/null github.com/mattn/go-sqlite3 # Now copy the actual source files COPY *.go . +# Import ARGs from top level +ARG VERSION +ARG COMMIT +ARG BUILD_DATE + +# Update version information +RUN sed -i \ + -e "s/devVersion/${VERSION}/" \ + -e "s/devBuildDate/${BUILD_DATE}/" \ + -e "s/devCommit/${COMMIT}/" \ + version.go + # Build the binary using caching for both go modules and build cache RUN CGO_ENABLED=1 GOMAXPROCS=$(nproc) go build -tags musl -o paperless-gpt . @@ -51,6 +79,8 @@ RUN npm run build # Stage 3: Create a lightweight image with the Go binary and frontend FROM alpine:latest +ENV GIN_MODE=release + # Install necessary runtime dependencies RUN apk add --no-cache \ ca-certificates diff --git a/README.md b/README.md index c45de32..97464ea 100644 --- a/README.md +++ b/README.md @@ -1,122 +1,129 @@ # paperless-gpt - [![License](https://img.shields.io/github/license/icereed/paperless-gpt)](LICENSE) [![Docker Pulls](https://img.shields.io/docker/pulls/icereed/paperless-gpt)](https://hub.docker.com/r/icereed/paperless-gpt) [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](CODE_OF_CONDUCT.md) ![Screenshot](./paperless-gpt-screenshot.png) -**paperless-gpt** is a tool designed to generate accurate and meaningful document titles and tags for [paperless-ngx](https://github.com/paperless-ngx/paperless-ngx) using Large Language Models (LLMs). It supports multiple LLM providers, including **OpenAI** and **Ollama**. With paperless-gpt, you can streamline your document management by automatically suggesting appropriate titles and tags based on the content of your scanned documents. 
+**paperless-gpt** seamlessly pairs with [paperless-ngx][paperless-ngx] to generate **AI-powered document titles** and **tags**, saving you hours of manual sorting. While other tools may offer AI chat features, **paperless-gpt** stands out by **supercharging OCR with LLMs**—ensuring high accuracy, even with tricky scans. If you’re craving next-level text extraction and effortless document organization, this is your solution. -[![Demo](./demo.gif)](./demo.gif) +https://github.com/user-attachments/assets/bd5d38b9-9309-40b9-93ca-918dfa4f3fd4 -## Features +--- -- **Multiple LLM Support**: Choose between OpenAI and Ollama for generating document titles and tags. -- **Customizable Prompts**: Modify the prompt templates to suit your specific needs. -- **Easy Integration**: Works seamlessly with your existing paperless-ngx setup. -- **User-Friendly Interface**: Intuitive web interface for reviewing and applying suggested titles and tags. -- **Dockerized Deployment**: Simple setup using Docker and Docker Compose. -- **Automatic Document Processing**: Automatically apply generated suggestions for documents with the `paperless-gpt-auto` tag. -- **Experimental OCR Feature**: Send documents to a vision LLM for OCR processing. +## Key Highlights + +1. **LLM-Enhanced OCR** + Harness Large Language Models (OpenAI or Ollama) for **better-than-traditional** OCR—turn messy or low-quality scans into context-aware, high-fidelity text. + +2. **Automatic Title & Tag Generation** + No more guesswork. Let the AI do the naming and categorizing. You can easily review suggestions and refine them if needed. + +3. **Extensive Customization** + - **Prompt Templates**: Tweak your AI prompts to reflect your domain, style, or preference. + - **Tagging**: Decide how documents get tagged—manually, automatically, or via OCR-based flows. + +4. **Simple Docker Deployment** + A few environment variables, and you’re off! Compose it alongside paperless-ngx with minimal fuss. + +5. **Unified Web UI** + - **Manual Review**: Approve or tweak AI’s suggestions. + - **Auto Processing**: Focus only on edge cases while the rest is sorted for you. + +6. **Opt-In LLM-based OCR** + If you opt in, your images get read by a Vision LLM, pushing boundaries beyond standard OCR tools. 
+ +--- ## Table of Contents +- [Key Highlights](#key-highlights) +- [Getting Started](#getting-started) + - [Prerequisites](#prerequisites) + - [Installation](#installation) + - [Docker Compose](#docker-compose) + - [Manual Setup](#manual-setup) +- [Configuration](#configuration) + - [Environment Variables](#environment-variables) + - [Custom Prompt Templates](#custom-prompt-templates) + - [Prompt Templates Directory](#prompt-templates-directory) + - [Mounting the Prompts Directory](#mounting-the-prompts-directory) + - [Editing the Prompt Templates](#editing-the-prompt-templates) + - [Template Syntax and Variables](#template-syntax-and-variables) +- [OCR using AI](#llm-based-ocr-compare-for-yourself) +- [Usage](#usage) +- [Contributing](#contributing) +- [License](#license) +- [Star History](#star-history) +- [Disclaimer](#disclaimer) -- [paperless-gpt](#paperless-gpt) - - [Features](#features) - - [Table of Contents](#table-of-contents) - - [Getting Started](#getting-started) - - [Prerequisites](#prerequisites) - - [Installation](#installation) - - [Docker Compose](#docker-compose) - - [Manual Setup](#manual-setup) - - [Configuration](#configuration) - - [Environment Variables](#environment-variables) - - [Custom Prompt Templates](#custom-prompt-templates) - - [Prompt Templates Directory](#prompt-templates-directory) - - [Mounting the Prompts Directory](#mounting-the-prompts-directory) - - [Editing the Prompt Templates](#editing-the-prompt-templates) - - [Template Syntax and Variables](#template-syntax-and-variables) - - [Usage](#usage) - - [Contributing](#contributing) - - [License](#license) - - [Star History](#star-history) +--- ## Getting Started ### Prerequisites - -- [Docker](https://www.docker.com/get-started) installed on your system. -- A running instance of [paperless-ngx](https://github.com/paperless-ngx/paperless-ngx). +- [Docker][docker-install] installed. +- A running instance of [paperless-ngx][paperless-ngx]. - Access to an LLM provider: - - **OpenAI**: An API key with access to models like `gpt-4o` or `gpt-3.5-turbo`. - - **Ollama**: A running Ollama server with models like `llama2` installed. + - **OpenAI**: An API key with models like `gpt-4o` or `gpt-3.5-turbo`. + - **Ollama**: A running Ollama server with models like `llama2`. ### Installation #### Docker Compose -The easiest way to get started is by using Docker Compose. Below is an example `docker-compose.yml` file to set up paperless-gpt alongside paperless-ngx. +Here’s an example `docker-compose.yml` to spin up **paperless-gpt** alongside paperless-ngx: ```yaml version: "3.7" services: paperless-ngx: image: ghcr.io/paperless-ngx/paperless-ngx:latest - # ... (your existing paperless-ngx configuration) + # ... 
(your existing paperless-ngx config) paperless-gpt: image: icereed/paperless-gpt:latest environment: - PAPERLESS_BASE_URL: "http://paperless-ngx:8000" - PAPERLESS_API_TOKEN: "your_paperless_api_token" - PAPERLESS_PUBLIC_URL: "http://paperless.mydomain.com" # Optional, your public link to access Paperless - MANUAL_TAG: "paperless-gpt" # Optional, default is 'paperless-gpt' - AUTO_TAG: "paperless-gpt-auto" # Optional, default is 'paperless-gpt-auto' - LLM_PROVIDER: "openai" # or 'ollama' - LLM_MODEL: "gpt-4o" # or 'llama2' - OPENAI_API_KEY: "your_openai_api_key" # Required if using OpenAI - LLM_LANGUAGE: "English" # Optional, default is 'English' - OLLAMA_HOST: "http://host.docker.internal:11434" # If using Ollama - VISION_LLM_PROVIDER: "ollama" # Optional (for OCR) - ollama or openai - VISION_LLM_MODEL: "minicpm-v" # Optional (for OCR) - minicpm-v, for example for ollama, gpt-4o for openai - AUTO_OCR_TAG: "paperless-gpt-ocr-auto" # Optional, default is 'paperless-gpt-ocr-auto' - LOG_LEVEL: "info" # Optional or 'debug', 'warn', 'error' + PAPERLESS_BASE_URL: 'http://paperless-ngx:8000' + PAPERLESS_API_TOKEN: 'your_paperless_api_token' + PAPERLESS_PUBLIC_URL: 'http://paperless.mydomain.com' # Optional + MANUAL_TAG: 'paperless-gpt' # Optional, default: paperless-gpt + AUTO_TAG: 'paperless-gpt-auto' # Optional, default: paperless-gpt-auto + LLM_PROVIDER: 'openai' # or 'ollama' + LLM_MODEL: 'gpt-4o' # or 'llama2' + OPENAI_API_KEY: 'your_openai_api_key' + # Optional - OPENAI_BASE_URL: 'https://litellm.yourinstallationof.it.com/v1' + LLM_LANGUAGE: 'English' # Optional, default: English + OLLAMA_HOST: 'http://host.docker.internal:11434' # If using Ollama + VISION_LLM_PROVIDER: 'ollama' # (for OCR) - openai or ollama + VISION_LLM_MODEL: 'minicpm-v' # (for OCR) - minicpm-v (ollama example), gpt-4o (for openai), etc. + AUTO_OCR_TAG: 'paperless-gpt-ocr-auto' # Optional, default: paperless-gpt-ocr-auto + OCR_LIMIT_PAGES: '5' # Optional, default: 5. Set to 0 for no limit. + LOG_LEVEL: 'info' # Optional: debug, warn, error volumes: - - ./prompts:/app/prompts # Mount the prompts directory + - ./prompts:/app/prompts # Mount the prompts directory ports: - "8080:8080" depends_on: - paperless-ngx ``` -**Note:** Replace the placeholder values with your actual configuration. +**Pro Tip**: Replace placeholders with real values and read the logs if something looks off. #### Manual Setup - -If you prefer to run the application manually: - -1. **Clone the Repository:** - +1. **Clone the Repository** ```bash git clone https://github.com/icereed/paperless-gpt.git cd paperless-gpt ``` - -2. **Create a `prompts` Directory:** - +2. **Create a `prompts` Directory** ```bash mkdir prompts ``` - -3. **Build the Docker Image:** - +3. **Build the Docker Image** ```bash docker build -t paperless-gpt . ``` - -4. **Run the Container:** - +4. 
**Run the Container** ```bash docker run -d \ -e PAPERLESS_BASE_URL='http://your_paperless_ngx_url' \ @@ -128,201 +135,313 @@ If you prefer to run the application manually: -e VISION_LLM_PROVIDER='ollama' \ -e VISION_LLM_MODEL='minicpm-v' \ -e LOG_LEVEL='info' \ - -v $(pwd)/prompts:/app/prompts \ # Mount the prompts directory + -v $(pwd)/prompts:/app/prompts \ -p 8080:8080 \ paperless-gpt ``` +--- + ## Configuration ### Environment Variables -| Variable | Description | Required | -| -------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | -------- | -| `PAPERLESS_BASE_URL` | The base URL of your paperless-ngx instance (e.g., `http://paperless-ngx:8000`). | Yes | -| `PAPERLESS_API_TOKEN` | API token for accessing paperless-ngx. You can generate one in the paperless-ngx admin interface. | Yes | -| `PAPERLESS_PUBLIC_URL` | The public URL for your Paperless instance, if it is different to your `PAPERLESS_BASE_URL` - say if you are running in Docker Compose | No | -| `MANUAL_TAG` | The tag to use for manually processing documents. Default is `paperless-gpt`. | No | -| `AUTO_TAG` | The tag to use for automatically processing documents. Default is `paperless-gpt-auto`. | No | -| `LLM_PROVIDER` | The LLM provider to use (`openai` or `ollama`). | Yes | -| `LLM_MODEL` | The model name to use (e.g., `gpt-4o`, `gpt-3.5-turbo`, `llama2`). | Yes | -| `OPENAI_API_KEY` | Your OpenAI API key. Required if using OpenAI as the LLM provider. | Cond. | -| `LLM_LANGUAGE` | The likely language of your documents (e.g., `English`, `German`). Default is `English`. | No | -| `OLLAMA_HOST` | The URL of the Ollama server (e.g., `http://host.docker.internal:11434`). Useful if using Ollama. Default is `http://127.0.0.1:11434`. | No | -| `VISION_LLM_PROVIDER` | The vision LLM provider to use for OCR (`openai` or `ollama`). | No | -| `VISION_LLM_MODEL` | The model name to use for OCR (e.g., `minicpm-v`). | No | -| `AUTO_OCR_TAG` | The tag to use for automatically processing documents with OCR. Default is `paperless-gpt-ocr-auto`. | No | -| `LOG_LEVEL` | The log level for the application (`info`, `debug`, `warn`, `error`). Default is `info`. | No | -| `LISTEN_INTERFACE` | The interface paperless-gpt listens to. Default is `:8080` | No | -| `WEBUI_PATH` | The path to load static content from. Default is `./web-app/dist` | No | -| `AUTO_GENERATE_TITLE` | Enable/disable title generation when automatically applying suggestions with `paperless-gpt-auto`. Default is `true` | No | -| `AUTO_GENERATE_TAGS` | Enable/disable tag generation when automatically applying suggestions with `paperless-gpt-auto`. Default is `true` | No | -| `CORRESPONDENT_BLACK_LIST` | A comma-separated list of names to exclude from the correspondents suggestions. Example: `John Doe, Jane Smith`. | No | - **Note:** When using Ollama, ensure that the Ollama server is running and accessible from the paperless-gpt container. +======= +| Variable | Description | Required | +|------------------------|------------------------------------------------------------------------------------------------------------------|----------| +| `PAPERLESS_BASE_URL` | URL of your paperless-ngx instance (e.g. `http://paperless-ngx:8000`). | Yes | +| `PAPERLESS_API_TOKEN` | API token for paperless-ngx. Generate one in paperless-ngx admin. | Yes | +| `PAPERLESS_PUBLIC_URL` | Public URL for Paperless (if different from `PAPERLESS_BASE_URL`). 
| No | +| `MANUAL_TAG` | Tag for manual processing. Default: `paperless-gpt`. | No | +| `AUTO_TAG` | Tag for auto processing. Default: `paperless-gpt-auto`. | No | +| `LLM_PROVIDER` | AI backend (`openai` or `ollama`). | Yes | +| `LLM_MODEL` | AI model name, e.g. `gpt-4o`, `gpt-3.5-turbo`, `llama2`. | Yes | +| `OPENAI_API_KEY` | OpenAI API key (required if using OpenAI). | Cond. | +| `OPENAI_BASE_URL` | OpenAI base URL (optional, if using a custom OpenAI compatible service like LiteLLM). | No | +| `LLM_LANGUAGE` | Likely language for documents (e.g. `English`). Default: `English`. | No | +| `OLLAMA_HOST` | Ollama server URL (e.g. `http://host.docker.internal:11434`). | No | +| `VISION_LLM_PROVIDER` | AI backend for OCR (`openai` or `ollama`). | No | +| `VISION_LLM_MODEL` | Model name for OCR (e.g. `minicpm-v`). | No | +| `AUTO_OCR_TAG` | Tag for automatically processing docs with OCR. Default: `paperless-gpt-ocr-auto`. | No | +| `LOG_LEVEL` | Application log level (`info`, `debug`, `warn`, `error`). Default: `info`. | No | +| `LISTEN_INTERFACE` | Network interface to listen on. Default: `:8080`. | No | +| `WEBUI_PATH` | Path for static content. Default: `./web-app/dist`. | No | +| `AUTO_GENERATE_TITLE` | Generate titles automatically if `paperless-gpt-auto` is used. Default: `true`. | No | +| `AUTO_GENERATE_TAGS` | Generate tags automatically if `paperless-gpt-auto` is used. Default: `true`. | No | +| `AUTO_GENERATE_CORRESPONDENTS` | Generate correspondents automatically if `paperless-gpt-auto` is used. Default: `true`. | No | +| `OCR_LIMIT_PAGES` | Limit the number of pages for OCR. Set to `0` for no limit. Default: `5`. | No | +| `CORRESPONDENT_BLACK_LIST` | A comma-separated list of names to exclude from the correspondents suggestions. Example: `John Doe, Jane Smith`. ### Custom Prompt Templates -You can customize the prompt templates used by paperless-gpt to generate titles and tags. By default, the application uses built-in templates, but you can modify them by editing the template files. +paperless-gpt’s flexible **prompt templates** let you shape how AI responds: -#### Prompt Templates Directory +1. **`title_prompt.tmpl`**: For document titles. +2. **`tag_prompt.tmpl`**: For tagging logic. +3. **`ocr_prompt.tmpl`**: For LLM OCR. -The prompt templates are stored in the `prompts` directory inside the application. The two main template files are: - -- `title_prompt.tmpl`: Template used for generating document titles. -- `tag_prompt.tmpl`: Template used for generating document tags. - -#### Mounting the Prompts Directory - -To modify the prompt templates, you need to mount a local `prompts` directory into the container. - -**Docker Compose Example:** +Mount them into your container via: ```yaml -services: - paperless-gpt: - image: icereed/paperless-gpt:latest - # ... (other configurations) - volumes: - - ./prompts:/app/prompts # Mount the prompts directory + volumes: + - ./prompts:/app/prompts ``` -**Docker Run Command Example:** +Then tweak at will—**paperless-gpt** reloads them automatically on startup! -```bash -docker run -d \ - # ... (other configurations) - -v $(pwd)/prompts:/app/prompts \ - paperless-gpt -``` - -#### Editing the Prompt Templates - -1. **Start the Container:** - - When you first start the container with the `prompts` directory mounted, it will automatically create the default template files in your local `prompts` directory if they do not exist. - -2. 
**Edit the Template Files:** - - - Open `prompts/title_prompt.tmpl` and `prompts/tag_prompt.tmpl` with your favorite text editor. - - Modify the templates using Go's `text/template` syntax. - - Save the changes. - -3. **Restart the Container (if necessary):** - - The application automatically reloads the templates when it starts. If the container is already running, you may need to restart it to apply the changes. - -#### Template Syntax and Variables - -The templates use Go's `text/template` syntax and have access to the following variables: - -- **For `title_prompt.tmpl`:** - - - `{{.Language}}`: The language specified in `LLM_LANGUAGE` (default is `English`). - - `{{.Content}}`: The content of the document. - -- **For `tag_prompt.tmpl`:** - - - `{{.Language}}`: The language specified in `LLM_LANGUAGE`. - - `{{.AvailableTags}}`: A list (array) of available tags from paperless-ngx. - - `{{.Title}}`: The suggested title for the document. - - `{{.Content}}`: The content of the document. - -**Example `title_prompt.tmpl`:** - -```text -I will provide you with the content of a document that has been partially read by OCR (so it may contain errors). -Your task is to find a suitable document title that I can use as the title in the paperless-ngx program. -Respond only with the title, without any additional information. The content is likely in {{.Language}}. - -Be sure to add one fitting emoji at the beginning of the title to make it more visually appealing. - -Content: -{{.Content}} -``` - -**Example `tag_prompt.tmpl`:** - -```text -I will provide you with the content and the title of a document. Your task is to select appropriate tags for the document from the list of available tags I will provide. Only select tags from the provided list. Respond only with the selected tags as a comma-separated list, without any additional information. The content is likely in {{.Language}}. - -Available Tags: -{{.AvailableTags | join ","}} - -Title: -{{.Title}} - -Content: -{{.Content}} - -Please concisely select the {{.Language}} tags from the list above that best describe the document. -Be very selective and only choose the most relevant tags since too many tags will make the document less discoverable. -``` - -**Note:** Advanced users can utilize additional functions from the [Sprig](http://masterminds.github.io/sprig/) template library, as it is included in the application. +--- ## Usage -1. **Tag Documents in paperless-ngx:** +1. **Tag Documents** + - Add `paperless-gpt` or your custom tag to the docs you want to AI-ify. - - Add the tag `paperless-gpt` to documents you want to process. This tag is configurable via the `tagToFilter` variable in the code (default is `paperless-gpt`). +2. **Visit Web UI** + - Go to `http://localhost:8080` (or your host) in your browser. -2. **Access the paperless-gpt Interface:** +3. **Generate & Apply Suggestions** + - Click “Generate Suggestions” to see AI-proposed titles/tags. + - Approve, edit, or discard. Hit “Apply” to finalize in paperless-ngx. - - Open your browser and navigate to `http://localhost:8080`. +4. **Try LLM-Based OCR (Experimental)** + - If you enabled `VISION_LLM_PROVIDER` and `VISION_LLM_MODEL`, let AI-based OCR read your scanned PDFs. + - Tag those documents with `paperless-gpt-ocr-auto` (or your custom `AUTO_OCR_TAG`). -3. **Process Documents:** +**Tip**: The entire pipeline can be **fully automated** if you prefer minimal manual intervention. 
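For the experimental OCR flow in step 4, the configuration boils down to a handful of environment variables. A minimal sketch using Ollama (the model and values are only examples; see the environment variable table above for the defaults):

```env
VISION_LLM_PROVIDER=ollama           # or: openai
VISION_LLM_MODEL=minicpm-v           # e.g. gpt-4o when using openai
AUTO_OCR_TAG=paperless-gpt-ocr-auto  # optional, this is the default
OCR_LIMIT_PAGES=5                    # optional, default 5; 0 disables the limit
```

Tag a document with the OCR tag and, on the next processing run, the recognized text is written back as the document content in paperless-ngx and the tag is removed.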
- - Click on **"Generate Suggestions"** to let the LLM generate title suggestions based on the document content. +--- -4. **Review and Apply Titles and Tags:** +## LLM-Based OCR: Compare for Yourself - - Review the suggested titles. You can edit them if necessary. - - Click on **"Apply Suggestions"** to update the document titles in paperless-ngx. +
+Click to expand the vanilla OCR vs. AI-powered OCR comparison -5. **Experimental OCR Feature:** +### Example 1 - - Send documents to a vision LLM for OCR processing. - - Example configuration to enable OCR with Ollama: - ```env - VISION_LLM_PROVIDER=ollama - VISION_LLM_MODEL=minicpm-v - ``` +**Image**: + +![Image](demo/ocr-example1.jpg) + +**Vanilla Paperless-ngx OCR**: +``` +La Grande Recre + +Gentre Gommercial 1'Esplanade +1349 LOLNAIN LA NEWWE +TA BERBOGAAL Tel =. 010 45,96 12 +Ticket 1440112 03/11/2006 a 13597: +4007176614518. DINOS. TYRAMNESA +TOTAET.T.LES +ReslE par Lask-Euron +Rencu en Cash Euro +V.14.6 -Hotgese = VALERTE +TICKET A-GONGERVER PORR TONT. EEHANGE +HERET ET A BIENTOT +``` + +**LLM-Powered OCR (OpenAI gpt-4o)**: +``` +La Grande Récré +Centre Commercial l'Esplanade +1348 LOUVAIN LA NEUVE +TVA 860826401 Tel : 010 45 95 12 +Ticket 14421 le 03/11/2006 à 15:27:18 +4007176614518 DINOS TYRANNOSA 14.90 +TOTAL T.T.C. 14.90 +Réglé par Cash Euro 50.00 +Rendu en Cash Euro 35.10 +V.14.6 Hôtesse : VALERIE +TICKET A CONSERVER POUR TOUT ECHANGE +MERCI ET A BIENTOT +``` + +--- + +### Example 2 + +**Image**: + +![Image](demo/ocr-example2.jpg) + +**Vanilla Paperless-ngx OCR**: +``` +Invoice Number: 1-996-84199 + +Fed: Invoica Date: Sep01, 2014 +Accaunt Number: 1334-8037-4 +Page: 1012 + +Fod£x Tax ID 71.0427007 + +IRISINC +SHARON ANDERSON +4731 W ATLANTIC AVE STE BI +DELRAY BEACH FL 33445-3897 ’ a +Invoice Questions? + +Bing, ‚Account Shipping Address: Contact FedEx Reı + +ISINC +4731 W ATLANTIC AVE Phone: (800) 622-1147 M-F 7-6 (CST) +DELRAY BEACH FL 33445-3897 US Fax: (800) 548-3020 + +Internet: www.fedex.com + +Invoice Summary Sep 01, 2014 + +FodEx Ground Services +Other Charges 11.00 +Total Charges 11.00 Da £ +> +polo) Fz// /G +TOTAL THIS INVOICE .... usps 11.00 P 2/1 f + +‘The only charges accrued for this period is the Weekly Service Charge. + +The Fedix Ground aceounts teferencedin his involce have been transteired and assigned 10, are owned by,andare payable to FedEx Express: + +To onsurs propor credit, plasa raturn this portion wirh your payment 10 FodEx +‚Please do not staple or fold. Ploase make your chack payablı to FedEx. + +[TI For change ol address, hc har and camphat lrm or never ide + +Remittance Advice +Your payment is due by Sep 16, 2004 + +Number Number Dus + +1334803719968 41993200000110071 + +AT 01 0391292 468448196 A**aDGT + +IRISINC Illallun elalalssollallansdHilalellund +SHARON ANDERSON + +4731 W ATLANTIC AVE STEBI FedEx + +DELRAY BEACH FL 334453897 PO. Box 94516 + +PALATINE IL 60094-4515 +``` + +**LLM-Powered OCR (OpenAI gpt-4o)**: +``` +FedEx. Invoice Number: 1-996-84199 + Invoice Date: Sep 01, 2014 + Account Number: 1334-8037-4 + Page: 1 of 2 + FedEx Tax ID: 71-0427007 + +I R I S INC +SHARON ANDERSON +4731 W ATLANTIC AVE STE B1 +DELRAY BEACH FL 33445-3897 + Invoice Questions? +Billing Account Shipping Address: Contact FedEx Revenue Services +I R I S INC Phone: (800) 622-1147 M-F 7-6 (CST) +4731 W ATLANTIC AVE Fax: (800) 548-3020 +DELRAY BEACH FL 33445-3897 US Internet: www.fedex.com + +Invoice Summary Sep 01, 2014 + +FedEx Ground Services +Other Charges 11.00 + +Total Charges .......................................................... USD $ 11.00 + +TOTAL THIS INVOICE .............................................. USD $ 11.00 + +The only charges accrued for this period is the Weekly Service Charge. 
+ + RECEIVED + SEP _ 8 REC'D + BY: _ + + posted 9/21/14 + +The FedEx Ground accounts referenced in this invoice have been transferred and assigned to, are owned by, and are payable to FedEx Express. + +To ensure proper credit, please return this portion with your payment to FedEx. +Please do not staple or fold. Please make your check payable to FedEx. + +❑ For change of address, check here and complete form on reverse side. + +Remittance Advice +Your payment is due by Sep 16, 2004 + +Invoice +Number +1-996-84199 + +Account +Number +1334-8037-4 + +Amount +Due +USD $ 11.00 + +133480371996841993200000110071 + +AT 01 031292 468448196 A**3DGT + +I R I S INC +SHARON ANDERSON +4731 W ATLANTIC AVE STE B1 +DELRAY BEACH FL 33445-3897 + +FedEx +P.O. Box 94515 +``` + +--- +
+ +**Why Does It Matter?** +- Traditional OCR often jumbles text from complex or low-quality scans. +- Large Language Models interpret context and correct likely errors, producing results that are more precise and readable. +- You can integrate these cleaned-up texts into your **paperless-ngx** pipeline for better tagging, searching, and archiving. + + + + +### How It Works + +- **Vanilla OCR** typically uses classical methods or Tesseract-like engines to extract text, which can result in garbled outputs for complex fonts or poor-quality scans. +- **LLM-Powered OCR** uses your chosen AI backend—OpenAI or Ollama—to interpret the image’s text in a more context-aware manner. This leads to fewer errors and more coherent text. + +--- ## Contributing -Contributions are welcome! Please read the [contributing guidelines](CONTRIBUTING.md) before submitting a pull request. +**Pull requests** and **issues** are welcome! +1. Fork the repo +2. Create a branch (`feature/my-awesome-update`) +3. Commit changes (`git commit -m "Improve X"`) +4. Open a PR -1. **Fork the Repository** +Check out our [contributing guidelines](CONTRIBUTING.md) for details. -2. **Create a Feature Branch** - - ```bash - git checkout -b feature/my-new-feature - ``` - -3. **Commit Your Changes** - - ```bash - git commit -am 'Add some feature' - ``` - -4. **Push to the Branch** - - ```bash - git push origin feature/my-new-feature - ``` - -5. **Create a Pull Request** +--- ## License -This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. +paperless-gpt is licensed under the [MIT License](LICENSE). Feel free to adapt and share! + +--- ## Star History - [![Star History Chart](https://api.star-history.com/svg?repos=icereed/paperless-gpt&type=Date)](https://star-history.com/#icereed/paperless-gpt&Date) --- -**Disclaimer:** This project is not affiliated with the official paperless-ngx project. Use at your own discretion. +## Disclaimer +This project is **not** officially affiliated with [paperless-ngx][paperless-ngx]. Use at your own risk. + +--- + +**paperless-gpt**: The **LLM-based** companion your doc management has been waiting for. Enjoy effortless, intelligent document titles, tags, and next-level OCR. 
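For contributors curious how the LLM-based OCR described in "How It Works" maps to code: each page image is sent to the vision model together with the OCR prompt via [langchaingo](https://github.com/tmc/langchaingo), which is essentially what `doOCRViaLLM` in `app_llm.go` does. Below is a rough, self-contained sketch of that call; the helper name, model choice, and file path are illustrative and not part of the codebase:

```go
package main

import (
	"context"
	"fmt"
	"os"

	"github.com/tmc/langchaingo/llms"
	"github.com/tmc/langchaingo/llms/ollama"
)

// ocrPage sends a single page image plus an OCR prompt to a vision-capable
// model and returns the transcribed text.
func ocrPage(ctx context.Context, model llms.Model, jpegBytes []byte, prompt string) (string, error) {
	parts := []llms.ContentPart{
		// Binary image part works for Ollama; OpenAI expects a base64 data URL instead.
		llms.BinaryPart("image/jpeg", jpegBytes),
		llms.TextPart(prompt),
	}
	resp, err := model.GenerateContent(ctx, []llms.MessageContent{
		{Role: llms.ChatMessageTypeHuman, Parts: parts},
	})
	if err != nil {
		return "", fmt.Errorf("vision LLM call failed: %w", err)
	}
	return resp.Choices[0].Content, nil
}

func main() {
	model, err := ollama.New(ollama.WithModel("minicpm-v")) // illustrative model
	if err != nil {
		panic(err)
	}
	page, err := os.ReadFile("page000.jpg") // illustrative path
	if err != nil {
		panic(err)
	}
	text, err := ocrPage(context.Background(), model, page, "Transcribe all text in this image.")
	if err != nil {
		panic(err)
	}
	fmt.Println(text)
}
```

When the provider is OpenAI, the image is passed as a base64 `data:image/jpeg;base64,...` URL via `llms.ImageURLPart` rather than a binary part, matching the provider switch in `doOCRViaLLM`.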
+ +[paperless-ngx]: https://github.com/paperless-ngx/paperless-ngx +[docker-install]: https://docs.docker.com/get-docker/ diff --git a/app_http_handlers.go b/app_http_handlers.go index 6fff276..58809ec 100644 --- a/app_http_handlers.go +++ b/app_http_handlers.go @@ -119,7 +119,7 @@ func (app *App) generateSuggestionsHandler(c *gin.Context) { return } - results, err := app.generateDocumentSuggestions(ctx, suggestionRequest) + results, err := app.generateDocumentSuggestions(ctx, suggestionRequest, log.WithContext(ctx)) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Sprintf("Error processing documents: %v", err)}) log.Errorf("Error processing documents: %v", err) diff --git a/app_llm.go b/app_llm.go index 7fae692..7547c06 100644 --- a/app_llm.go +++ b/app_llm.go @@ -5,9 +5,13 @@ import ( "context" "encoding/base64" "fmt" + "image" "strings" "sync" + _ "image/jpeg" + + "github.com/sirupsen/logrus" "github.com/tmc/langchaingo/llms" ) @@ -52,7 +56,12 @@ func (app *App) getSuggestedCorrespondent(ctx context.Context, content string, s } // getSuggestedTags generates suggested tags for a document using the LLM -func (app *App) getSuggestedTags(ctx context.Context, content string, suggestedTitle string, availableTags []string) ([]string, error) { +func (app *App) getSuggestedTags( + ctx context.Context, + content string, + suggestedTitle string, + availableTags []string, + logger *logrus.Entry) ([]string, error) { likelyLanguage := getLikelyLanguage() templateMutex.RLock() @@ -66,11 +75,12 @@ func (app *App) getSuggestedTags(ctx context.Context, content string, suggestedT "Content": content, }) if err != nil { + logger.Errorf("Error executing tag template: %v", err) return nil, fmt.Errorf("error executing tag template: %v", err) } prompt := promptBuffer.String() - log.Debugf("Tag suggestion prompt: %s", prompt) + logger.Debugf("Tag suggestion prompt: %s", prompt) completion, err := app.LLM.GenerateContent(ctx, []llms.MessageContent{ { @@ -83,6 +93,7 @@ func (app *App) getSuggestedTags(ctx context.Context, content string, suggestedT }, }) if err != nil { + logger.Errorf("Error getting response from LLM: %v", err) return nil, fmt.Errorf("error getting response from LLM: %v", err) } @@ -106,7 +117,7 @@ func (app *App) getSuggestedTags(ctx context.Context, content string, suggestedT return filteredTags, nil } -func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, error) { +func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte, logger *logrus.Entry) (string, error) { templateMutex.RLock() defer templateMutex.RUnlock() @@ -122,15 +133,27 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro prompt := promptBuffer.String() + // Log the image dimensions + img, _, err := image.Decode(bytes.NewReader(jpegBytes)) + if err != nil { + return "", fmt.Errorf("error decoding image: %v", err) + } + bounds := img.Bounds() + logger.Debugf("Image dimensions: %dx%d", bounds.Dx(), bounds.Dy()) + // If not OpenAI then use binary part for image, otherwise, use the ImageURL part with encoding from https://platform.openai.com/docs/guides/vision var parts []llms.ContentPart if strings.ToLower(visionLlmProvider) != "openai" { + // Log image size in kilobytes + logger.Debugf("Image size: %d KB", len(jpegBytes)/1024) parts = []llms.ContentPart{ llms.BinaryPart("image/jpeg", jpegBytes), llms.TextPart(prompt), } } else { base64Image := base64.StdEncoding.EncodeToString(jpegBytes) + // Log image size in kilobytes + logger.Debugf("Image size: 
%d KB", len(base64Image)/1024) parts = []llms.ContentPart{ llms.ImageURLPart(fmt.Sprintf("data:image/jpeg;base64,%s", base64Image)), llms.TextPart(prompt), @@ -154,7 +177,7 @@ func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, erro } // getSuggestedTitle generates a suggested title for a document using the LLM -func (app *App) getSuggestedTitle(ctx context.Context, content string) (string, error) { +func (app *App) getSuggestedTitle(ctx context.Context, content string, logger *logrus.Entry) (string, error) { likelyLanguage := getLikelyLanguage() templateMutex.RLock() @@ -171,7 +194,7 @@ func (app *App) getSuggestedTitle(ctx context.Context, content string) (string, prompt := promptBuffer.String() - log.Debugf("Title suggestion prompt: %s", prompt) + logger.Debugf("Title suggestion prompt: %s", prompt) completion, err := app.LLM.GenerateContent(ctx, []llms.MessageContent{ { @@ -191,7 +214,7 @@ func (app *App) getSuggestedTitle(ctx context.Context, content string) (string, } // generateDocumentSuggestions generates suggestions for a set of documents -func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionRequest GenerateSuggestionsRequest) ([]DocumentSuggestion, error) { +func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionRequest GenerateSuggestionsRequest, logger *logrus.Entry) ([]DocumentSuggestion, error) { // Fetch all available tags from paperless-ngx availableTagsMap, err := app.Client.GetAllTags(ctx) if err != nil { @@ -231,7 +254,8 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque go func(doc Document) { defer wg.Done() documentID := doc.ID - log.Printf("Processing Document ID %d...", documentID) + docLogger := documentLogger(documentID) + docLogger.Printf("Processing Document ID %d...", documentID) content := doc.Content if len(content) > 5000 { @@ -243,23 +267,23 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque var suggestedCorrespondent string if suggestionRequest.GenerateTitles { - suggestedTitle, err = app.getSuggestedTitle(ctx, content) + suggestedTitle, err = app.getSuggestedTitle(ctx, content, docLogger) if err != nil { mu.Lock() errorsList = append(errorsList, fmt.Errorf("Document %d: %v", documentID, err)) mu.Unlock() - log.Errorf("Error processing document %d: %v", documentID, err) + docLogger.Errorf("Error processing document %d: %v", documentID, err) return } } if suggestionRequest.GenerateTags { - suggestedTags, err = app.getSuggestedTags(ctx, content, suggestedTitle, availableTagNames) + suggestedTags, err = app.getSuggestedTags(ctx, content, suggestedTitle, availableTagNames, docLogger) if err != nil { mu.Lock() errorsList = append(errorsList, fmt.Errorf("Document %d: %v", documentID, err)) mu.Unlock() - log.Errorf("Error generating tags for document %d: %v", documentID, err) + logger.Errorf("Error generating tags for document %d: %v", documentID, err) return } } @@ -283,7 +307,7 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque } // Titles if suggestionRequest.GenerateTitles { - log.Printf("Suggested title for document %d: %s", documentID, suggestedTitle) + docLogger.Printf("Suggested title for document %d: %s", documentID, suggestedTitle) suggestion.SuggestedTitle = suggestedTitle } else { suggestion.SuggestedTitle = doc.Title @@ -291,10 +315,10 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque // Tags if suggestionRequest.GenerateTags { - log.Printf("Suggested 
tags for document %d: %v", documentID, suggestedTags) + docLogger.Printf("Suggested tags for document %d: %v", documentID, suggestedTags) suggestion.SuggestedTags = suggestedTags } else { - suggestion.SuggestedTags = removeTagFromList(doc.Tags, manualTag) + suggestion.SuggestedTags = doc.Tags } // Correspondents @@ -304,10 +328,12 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque } else { suggestion.SuggestedCorrespondent = "" } + // Remove manual tag from the list of suggested tags + suggestion.RemoveTags = []string{manualTag, autoTag} documentSuggestions = append(documentSuggestions, suggestion) mu.Unlock() - log.Printf("Document %d processed successfully.", documentID) + docLogger.Printf("Document %d processed successfully.", documentID) }(documents[i]) } diff --git a/demo.gif b/demo.gif deleted file mode 100644 index dc0603f..0000000 Binary files a/demo.gif and /dev/null differ diff --git a/demo.mp4 b/demo.mp4 new file mode 100644 index 0000000..6ce5550 Binary files /dev/null and b/demo.mp4 differ diff --git a/demo/ocr-example1.jpg b/demo/ocr-example1.jpg new file mode 100644 index 0000000..8fedd34 Binary files /dev/null and b/demo/ocr-example1.jpg differ diff --git a/demo/ocr-example2.jpg b/demo/ocr-example2.jpg new file mode 100644 index 0000000..928fc28 Binary files /dev/null and b/demo/ocr-example2.jpg differ diff --git a/go.mod b/go.mod index 880f413..2762c7d 100644 --- a/go.mod +++ b/go.mod @@ -2,7 +2,7 @@ module paperless-gpt go 1.22.0 -toolchain go1.22.2 +toolchain go1.23.4 require ( github.com/Masterminds/sprig/v3 v3.3.0 @@ -28,6 +28,7 @@ require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/dlclark/regexp2 v1.10.0 // indirect github.com/ebitengine/purego v0.8.0 // indirect + github.com/fatih/color v1.18.0 // indirect github.com/gabriel-vasile/mimetype v1.4.3 // indirect github.com/gin-contrib/sse v0.1.0 // indirect github.com/go-playground/locales v0.14.1 // indirect @@ -42,6 +43,7 @@ require ( github.com/jupiterrider/ffi v0.2.0 // indirect github.com/klauspost/cpuid/v2 v2.2.7 // indirect github.com/leodido/go-urn v1.4.0 // indirect + github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/mattn/go-sqlite3 v1.14.24 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect @@ -58,7 +60,7 @@ require ( golang.org/x/arch v0.8.0 // indirect golang.org/x/crypto v0.26.0 // indirect golang.org/x/net v0.25.0 // indirect - golang.org/x/sys v0.23.0 // indirect + golang.org/x/sys v0.25.0 // indirect golang.org/x/text v0.20.0 // indirect google.golang.org/protobuf v1.34.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 945e689..6af29a1 100644 --- a/go.sum +++ b/go.sum @@ -25,6 +25,8 @@ github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/ebitengine/purego v0.8.0 h1:JbqvnEzRvPpxhCJzJJ2y0RbiZ8nyjccVUrSM3q+GvvE= github.com/ebitengine/purego v0.8.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= +github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= +github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0= github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk= github.com/gen2brain/go-fitz v1.24.14 
h1:09weRkjVtLYNGo7l0J7DyOwBExbwi8SJ9h8YPhw9WEo= @@ -70,6 +72,9 @@ github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZY github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM= @@ -155,6 +160,7 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -162,6 +168,8 @@ golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= diff --git a/main.go b/main.go index 8279e6e..273f105 100644 --- a/main.go +++ b/main.go @@ -6,12 +6,15 @@ import ( "net/http" "os" "path/filepath" + "runtime" + "strconv" "strings" "sync" "text/template" "time" "github.com/Masterminds/sprig/v3" + "github.com/fatih/color" "github.com/gin-gonic/gin" "github.com/sirupsen/logrus" "github.com/tmc/langchaingo/llms" @@ -27,7 +30,6 @@ var ( log = logrus.New() // Environment Variables - correspondentBlackList = strings.Split(os.Getenv("CORRESPONDENT_BLACK_LIST"), ",") paperlessBaseURL = os.Getenv("PAPERLESS_BASE_URL") paperlessAPIToken = os.Getenv("PAPERLESS_API_TOKEN") openaiAPIKey = os.Getenv("OPENAI_API_KEY") @@ -45,6 +47,7 @@ var ( autoGenerateTitle = os.Getenv("AUTO_GENERATE_TITLE") autoGenerateTags = os.Getenv("AUTO_GENERATE_TAGS") autoGenerateCorrespondents = os.Getenv("AUTO_GENERATE_CORRESPONDENTS") + limitOcrPages int // Will be read from OCR_LIMIT_PAGES // Templates titleTemplate *template.Template @@ -121,6 +124,9 @@ func main() { // Initialize logrus logger initLogger() + // Print version + printVersion() 
+ // Initialize PaperlessClient client := NewPaperlessClient(paperlessBaseURL, paperlessAPIToken) @@ -263,6 +269,29 @@ func main() { } } +func printVersion() { + cyan := color.New(color.FgCyan).SprintFunc() + yellow := color.New(color.FgYellow).SprintFunc() + + banner := ` + ╔═══════════════════════════════════════╗ + ║ Paperless GPT ║ + ╚═══════════════════════════════════════╝` + + fmt.Printf("%s\n", cyan(banner)) + fmt.Printf("\n%s %s\n", yellow("Version:"), version) + if commit != "" { + fmt.Printf("%s %s\n", yellow("Commit:"), commit) + } + if buildDate != "" { + fmt.Printf("%s %s\n", yellow("Build Date:"), buildDate) + } + fmt.Printf("%s %s/%s\n", yellow("Platform:"), runtime.GOOS, runtime.GOARCH) + fmt.Printf("%s %s\n", yellow("Go Version:"), runtime.Version()) + fmt.Printf("%s %s\n", yellow("Started:"), time.Now().Format(time.RFC1123)) + fmt.Println() +} + func initLogger() { switch logLevel { case "debug": @@ -338,6 +367,24 @@ func validateOrDefaultEnvVars() { if (llmProvider == "openai" || visionLlmProvider == "openai") && openaiAPIKey == "" { log.Fatal("Please set the OPENAI_API_KEY environment variable for OpenAI provider.") } + + if isOcrEnabled() { + rawLimitOcrPages := os.Getenv("OCR_LIMIT_PAGES") + if rawLimitOcrPages == "" { + limitOcrPages = 5 + } else { + var err error + limitOcrPages, err = strconv.Atoi(rawLimitOcrPages) + if err != nil { + log.Fatalf("Invalid OCR_LIMIT_PAGES value: %v", err) + } + } + } +} + +// documentLogger creates a logger with document context +func documentLogger(documentID int) *logrus.Entry { + return log.WithField("document_id", documentID) } // processAutoTagDocuments handles the background auto-tagging of documents @@ -356,23 +403,29 @@ func (app *App) processAutoTagDocuments() (int, error) { log.Debugf("Found at least %d remaining documents with tag %s", len(documents), autoTag) - suggestionRequest := GenerateSuggestionsRequest{ - Documents: documents, - GenerateTitles: strings.ToLower(autoGenerateTitle) != "false", - GenerateTags: strings.ToLower(autoGenerateTags) != "false", - GenerateCorrespondents: strings.ToLower(autoGenerateCorrespondents) != "false", - } + for _, document := range documents { + docLogger := documentLogger(document.ID) + docLogger.Info("Processing document for auto-tagging") - suggestions, err := app.generateDocumentSuggestions(ctx, suggestionRequest) - if err != nil { - return 0, fmt.Errorf("error generating suggestions: %w", err) - } + suggestionRequest := GenerateSuggestionsRequest{ + Documents: []Document{document}, + GenerateTitles: strings.ToLower(autoGenerateTitle) != "false", + GenerateTags: strings.ToLower(autoGenerateTags) != "false", + GenerateCorrespondents: strings.ToLower(autoGenerateCorrespondents) != "false", + } - err = app.Client.UpdateDocuments(ctx, suggestions, app.Database, false) - if err != nil { - return 0, fmt.Errorf("error updating documents: %w", err) - } + suggestions, err := app.generateDocumentSuggestions(ctx, suggestionRequest, docLogger) + if err != nil { + return 0, fmt.Errorf("error generating suggestions for document %d: %w", document.ID, err) + } + err = app.Client.UpdateDocuments(ctx, suggestions, app.Database, false) + if err != nil { + return 0, fmt.Errorf("error updating document %d: %w", document.ID, err) + } + + docLogger.Info("Successfully processed document") + } return len(documents), nil } @@ -392,26 +445,31 @@ func (app *App) processAutoOcrTagDocuments() (int, error) { log.Debugf("Found at least %d remaining documents with tag %s", len(documents), autoOcrTag) - documents 
= documents[:1] // Process only one document at a time + for _, document := range documents { + docLogger := documentLogger(document.ID) + docLogger.Info("Processing document for OCR") - ocrContent, err := app.ProcessDocumentOCR(ctx, documents[0].ID) - if err != nil { - return 0, fmt.Errorf("error processing document OCR: %w", err) + ocrContent, err := app.ProcessDocumentOCR(ctx, document.ID) + if err != nil { + return 0, fmt.Errorf("error processing OCR for document %d: %w", document.ID, err) + } + docLogger.Debug("OCR processing completed") + + err = app.Client.UpdateDocuments(ctx, []DocumentSuggestion{ + { + ID: document.ID, + OriginalDocument: document, + SuggestedContent: ocrContent, + RemoveTags: []string{autoOcrTag}, + }, + }, app.Database, false) + if err != nil { + return 0, fmt.Errorf("error updating document %d after OCR: %w", document.ID, err) + } + + docLogger.Info("Successfully processed document OCR") } - log.Debugf("OCR content for document %d: %s", documents[0].ID, ocrContent) - - err = app.Client.UpdateDocuments(ctx, []DocumentSuggestion{ - { - ID: documents[0].ID, - OriginalDocument: documents[0], - SuggestedContent: ocrContent, - }, - }, app.Database, false) - if err != nil { - return 0, fmt.Errorf("error updating documents: %w", err) - } - - return 1, nil // Processed one document + return 1, nil } // removeTagFromList removes a specific tag from a list of tags diff --git a/ocr.go b/ocr.go index ca8ed28..6819a98 100644 --- a/ocr.go +++ b/ocr.go @@ -9,31 +9,42 @@ import ( // ProcessDocumentOCR processes a document through OCR and returns the combined text func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int) (string, error) { - imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID) + docLogger := documentLogger(documentID) + docLogger.Info("Starting OCR processing") + + imagePaths, err := app.Client.DownloadDocumentAsImages(ctx, documentID, limitOcrPages) defer func() { for _, imagePath := range imagePaths { - os.Remove(imagePath) + if err := os.Remove(imagePath); err != nil { + docLogger.WithError(err).WithField("image_path", imagePath).Warn("Failed to remove temporary image file") + } } }() if err != nil { - return "", fmt.Errorf("error downloading document images: %w", err) + return "", fmt.Errorf("error downloading document images for document %d: %w", documentID, err) } + docLogger.WithField("page_count", len(imagePaths)).Debug("Downloaded document images") + var ocrTexts []string - for _, imagePath := range imagePaths { + for i, imagePath := range imagePaths { + pageLogger := docLogger.WithField("page", i+1) + pageLogger.Debug("Processing page") + imageContent, err := os.ReadFile(imagePath) if err != nil { - return "", fmt.Errorf("error reading image file: %w", err) + return "", fmt.Errorf("error reading image file for document %d, page %d: %w", documentID, i+1, err) } - ocrText, err := app.doOCRViaLLM(ctx, imageContent) + ocrText, err := app.doOCRViaLLM(ctx, imageContent, pageLogger) if err != nil { - return "", fmt.Errorf("error performing OCR: %w", err) + return "", fmt.Errorf("error performing OCR for document %d, page %d: %w", documentID, i+1, err) } - log.Debugf("OCR text: %s", ocrText) + pageLogger.Debug("OCR completed for page") ocrTexts = append(ocrTexts, ocrText) } + docLogger.Info("OCR processing completed successfully") return strings.Join(ocrTexts, "\n\n"), nil } diff --git a/paperless-gpt-screenshot.png b/paperless-gpt-screenshot.png index 99a70c1..2a8eda9 100644 Binary files a/paperless-gpt-screenshot.png and 
b/paperless-gpt-screenshot.png differ diff --git a/paperless.go b/paperless.go index e35068e..1aa5158 100644 --- a/paperless.go +++ b/paperless.go @@ -290,8 +290,9 @@ func (client *PaperlessClient) UpdateDocuments(ctx context.Context, documents [] } // remove autoTag to prevent infinite loop (even if it is in the original tags) - originalTags = removeTagFromList(originalTags, autoTag) - originalTags = removeTagFromList(originalTags, autoOcrTag) + for _, tag := range document.RemoveTags { + originalTags = removeTagFromList(originalTags, tag) + } if len(tags) == 0 { tags = originalTags @@ -300,6 +301,12 @@ func (client *PaperlessClient) UpdateDocuments(ctx context.Context, documents [] originalFields["tags"] = originalTags // remove autoTag to prevent infinite loop - this is required in case of undo tags = removeTagFromList(tags, autoTag) + + // keep previous tags + tags = append(tags, originalTags...) + // remove duplicates + slices.Sort(tags) + tags = slices.Compact(tags) } updatedTagsJSON, err := json.Marshal(tags) @@ -424,7 +431,8 @@ func (client *PaperlessClient) UpdateDocuments(ctx context.Context, documents [] } // DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images -func (client *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int) ([]string, error) { +// If limitPages > 0, only the first N pages will be processed +func (client *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int, limitPages int) ([]string, error) { // Create a directory named after the document ID docDir := filepath.Join(client.GetCacheFolder(), fmt.Sprintf("document-%d", documentId)) if _, err := os.Stat(docDir); os.IsNotExist(err) { @@ -437,6 +445,9 @@ func (client *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, doc // Check if images already exist var imagePaths []string for n := 0; ; n++ { + if limitPages > 0 && n >= limitPages { + break + } imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n)) if _, err := os.Stat(imagePath); os.IsNotExist(err) { break @@ -485,10 +496,15 @@ func (client *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, doc } defer doc.Close() + totalPages := doc.NumPage() + if limitPages > 0 && limitPages < totalPages { + totalPages = limitPages + } + var mu sync.Mutex var g errgroup.Group - for n := 0; n < doc.NumPage(); n++ { + for n := 0; n < totalPages; n++ { n := n // capture loop variable g.Go(func() error { mu.Lock() diff --git a/paperless_test.go b/paperless_test.go index 13cf103..c9be3a0 100644 --- a/paperless_test.go +++ b/paperless_test.go @@ -300,18 +300,24 @@ func TestUpdateDocuments(t *testing.T) { OriginalDocument: Document{ ID: 1, Title: "Old Title", - Tags: []string{"tag1"}, + Tags: []string{"tag1", "tag3", "manual", "removeMe"}, }, SuggestedTitle: "New Title", - SuggestedTags: []string{"tag2"}, + SuggestedTags: []string{"tag2", "tag3"}, + RemoveTags: []string{"removeMe"}, }, } + idTag1 := 1 + idTag2 := 2 + idTag3 := 4 // Mock data for tags tagsResponse := map[string]interface{}{ "results": []map[string]interface{}{ - {"id": 1, "name": "tag1"}, - {"id": 2, "name": "tag2"}, + {"id": idTag1, "name": "tag1"}, + {"id": idTag2, "name": "tag2"}, {"id": 3, "name": "manual"}, + {"id": idTag3, "name": "tag3"}, + {"id": 5, "name": "removeMe"}, }, "next": nil, } @@ -342,7 +348,7 @@ func TestUpdateDocuments(t *testing.T) { // Expected updated fields expectedFields := map[string]interface{}{ "title": "New Title", - "tags": []interface{}{float64(2)}, 
// tag2 ID + "tags": []interface{}{float64(idTag1), float64(idTag2), float64(idTag3)}, // keep also previous tags } assert.Equal(t, expectedFields, updatedFields) @@ -385,7 +391,7 @@ func TestDownloadDocumentAsImages(t *testing.T) { }) ctx := context.Background() - imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID) + imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 0) require.NoError(t, err) // Verify that exatly one page was extracted @@ -422,11 +428,11 @@ func TestDownloadDocumentAsImages_ManyPages(t *testing.T) { env.client.CacheFolder = "tests/tmp" // Clean the cache folder os.RemoveAll(env.client.CacheFolder) - imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID) + imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document.ID, 50) require.NoError(t, err) - // Verify that exatly 52 pages were extracted - assert.Len(t, imagePaths, 52) + // Verify that exatly 50 pages were extracted - the original doc contains 52 pages + assert.Len(t, imagePaths, 50) // The path shall end with tests/tmp/document-321/page000.jpg for _, imagePath := range imagePaths { _, err := os.Stat(imagePath) diff --git a/renovate.json b/renovate.json index 5db72dd..b67362b 100644 --- a/renovate.json +++ b/renovate.json @@ -2,5 +2,19 @@ "$schema": "https://docs.renovatebot.com/renovate-schema.json", "extends": [ "config:recommended" + ], + "customManagers": [ + { + "customType": "regex", + "description": "Update VERSION variables in Dockerfiles", + "fileMatch": [ + "^Dockerfile$" + ], + "matchStrings": [ + "# renovate: datasource=(?[a-z-]+?) depName=(?.+?)(?: versioning=(?[a-z-]+?))?\\s(?:ENV|ARG) .+?_VERSION=\"(?.+?)\"\\s", + "# renovate: datasource=(?[a-z-]+?) depName=(?.+?)(?: versioning=(?[a-z-]+?))?\\s(?:ENV|ARG) VERSION=\"(?.+?)\"\\s" + ], + "versioningTemplate": "{{#if versioning}}{{versioning}}{{else}}semver{{/if}}" + } ] } diff --git a/types.go b/types.go index 40cbf6f..6bb16a7 100644 --- a/types.go +++ b/types.go @@ -81,6 +81,7 @@ type DocumentSuggestion struct { SuggestedTags []string `json:"suggested_tags,omitempty"` SuggestedContent string `json:"suggested_content,omitempty"` SuggestedCorrespondent string `json:"suggested_correspondent,omitempty"` + RemoveTags []string `json:"remove_tags,omitempty"` } type Correspondent struct { diff --git a/version.go b/version.go new file mode 100644 index 0000000..750920c --- /dev/null +++ b/version.go @@ -0,0 +1,7 @@ +package main + +var ( + version = "devVersion" + buildDate = "devBuildDate" + commit = "devCommit" +) diff --git a/web-app/package-lock.json b/web-app/package-lock.json index fe204d6..f7624a8 100644 --- a/web-app/package-lock.json +++ b/web-app/package-lock.json @@ -2889,12 +2889,16 @@ } }, "node_modules/lilconfig": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz", - "integrity": "sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==", + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz", + "integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==", "dev": true, + "license": "MIT", "engines": { - "node": ">=10" + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/antonk52" } }, "node_modules/lines-and-columns": { @@ -3350,18 +3354,6 @@ } } }, - "node_modules/postcss-load-config/node_modules/lilconfig": { - "version": "3.1.2", - "resolved": 
"https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.2.tgz", - "integrity": "sha512-eop+wDAvpItUys0FWkHIKeC9ybYrTGbU41U5K7+bttZZeohvnY7M9dZ5kB21GNWiFT2q1OoPTvncPCgSOVO5ow==", - "dev": true, - "engines": { - "node": ">=14" - }, - "funding": { - "url": "https://github.com/sponsors/antonk52" - } - }, "node_modules/postcss-nested": { "version": "6.2.0", "resolved": "https://registry.npmjs.org/postcss-nested/-/postcss-nested-6.2.0.tgz", @@ -3944,33 +3936,34 @@ "integrity": "sha512-Cat63mxsVJlzYvN51JmVXIgNoUokrIaT2zLclCXjRd8boZ0004U4KCs/sToJ75C6sdlByWxpYnb5Boif1VSFew==" }, "node_modules/tailwindcss": { - "version": "3.4.12", - "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.12.tgz", - "integrity": "sha512-Htf/gHj2+soPb9UayUNci/Ja3d8pTmu9ONTfh4QY8r3MATTZOzmv6UYWF7ZwikEIC8okpfqmGqrmDehua8mF8w==", + "version": "3.4.17", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.17.tgz", + "integrity": "sha512-w33E2aCvSDP0tW9RZuNXadXlkHXqFzSkQew/aIa2i/Sj8fThxwovwlXHSPXTbAHwEIhBFXAedUhP2tueAKP8Og==", "dev": true, + "license": "MIT", "dependencies": { "@alloc/quick-lru": "^5.2.0", "arg": "^5.0.2", - "chokidar": "^3.5.3", + "chokidar": "^3.6.0", "didyoumean": "^1.2.2", "dlv": "^1.1.3", - "fast-glob": "^3.3.0", + "fast-glob": "^3.3.2", "glob-parent": "^6.0.2", "is-glob": "^4.0.3", - "jiti": "^1.21.0", - "lilconfig": "^2.1.0", - "micromatch": "^4.0.5", + "jiti": "^1.21.6", + "lilconfig": "^3.1.3", + "micromatch": "^4.0.8", "normalize-path": "^3.0.0", "object-hash": "^3.0.0", - "picocolors": "^1.0.0", - "postcss": "^8.4.23", + "picocolors": "^1.1.1", + "postcss": "^8.4.47", "postcss-import": "^15.1.0", "postcss-js": "^4.0.1", - "postcss-load-config": "^4.0.1", - "postcss-nested": "^6.0.1", - "postcss-selector-parser": "^6.0.11", - "resolve": "^1.22.2", - "sucrase": "^3.32.0" + "postcss-load-config": "^4.0.2", + "postcss-nested": "^6.2.0", + "postcss-selector-parser": "^6.1.2", + "resolve": "^1.22.8", + "sucrase": "^3.35.0" }, "bin": { "tailwind": "lib/cli.js",