feat: Download PDFs as images

This commit is contained in:
Dominik Schröter 2024-10-26 09:00:22 +02:00
parent 10df151525
commit 8c7bd7273f
7 changed files with 541 additions and 4 deletions

1
.gitignore vendored
View file

@ -1,3 +1,4 @@
.env
.DS_Store
prompts/
tests/tmp

6
go.mod
View file

@ -6,7 +6,9 @@ toolchain go1.22.2
require (
github.com/Masterminds/sprig/v3 v3.2.3
github.com/gen2brain/go-fitz v1.24.14
github.com/gin-gonic/gin v1.10.0
github.com/stretchr/testify v1.9.0
github.com/tmc/langchaingo v0.1.12
)
@ -17,7 +19,9 @@ require (
github.com/bytedance/sonic/loader v0.1.1 // indirect
github.com/cloudwego/base64x v0.1.4 // indirect
github.com/cloudwego/iasm v0.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dlclark/regexp2 v1.10.0 // indirect
github.com/ebitengine/purego v0.8.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-playground/locales v0.14.1 // indirect
@ -28,6 +32,7 @@ require (
github.com/huandu/xstrings v1.3.3 // indirect
github.com/imdario/mergo v0.3.13 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/jupiterrider/ffi v0.2.0 // indirect
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
@ -37,6 +42,7 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/pkoukk/tiktoken-go v0.1.6 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/shopspring/decimal v1.2.0 // indirect
github.com/spf13/cast v1.3.1 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect

6
go.sum
View file

@ -17,8 +17,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0=
github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
github.com/ebitengine/purego v0.8.0 h1:JbqvnEzRvPpxhCJzJJ2y0RbiZ8nyjccVUrSM3q+GvvE=
github.com/ebitengine/purego v0.8.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
github.com/gen2brain/go-fitz v1.24.14 h1:09weRkjVtLYNGo7l0J7DyOwBExbwi8SJ9h8YPhw9WEo=
github.com/gen2brain/go-fitz v1.24.14/go.mod h1:0KaZeQgASc20Yp5R/pFzyy7SmP01XcoHKNF842U2/S4=
github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU=
@ -46,6 +50,8 @@ github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk=
github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/jupiterrider/ffi v0.2.0 h1:tMM70PexgYNmV+WyaYhJgCvQAvtTCs3wXeILPutihnA=
github.com/jupiterrider/ffi v0.2.0/go.mod h1:yqYqX5DdEccAsHeMn+6owkoI2llBLySVAF8dwCDZPVs=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=

View file

@ -5,10 +5,15 @@ import (
"context"
"encoding/json"
"fmt"
"image/jpeg"
"io"
"log"
"net/http"
"os"
"path/filepath"
"strings"
"github.com/gen2brain/go-fitz"
)
// PaperlessClient struct to interact with the Paperless-NGX API
@ -16,6 +21,7 @@ type PaperlessClient struct {
BaseURL string
APIToken string
HTTPClient *http.Client
CacheFolder string
}
// NewPaperlessClient creates a new instance of PaperlessClient with a default HTTP client
@ -239,6 +245,112 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum
return nil
}
// DownloadDocumentAsImages downloads the PDF of the given document and renders
// every page to a JPEG file in <cache folder>/document-<ID>/.
//
// Page files are named page000.jpg, page001.jpg, and so on. If page000.jpg
// already exists, the consecutive run of existing page files is returned and
// nothing is downloaded. Otherwise the PDF is fetched, each page is rendered,
// written and re-decoded as a sanity check, and the page paths are returned in
// page order.
//
// If anything fails while rendering, the page files written by this call are
// removed so that a later call does not mistake a partial set for a complete
// cache. Rendering stops early when ctx is cancelled.
func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document Document) ([]string, error) {
	docDir := filepath.Join(c.GetCacheFolder(), fmt.Sprintf("document-%d", document.ID))
	// MkdirAll succeeds when the directory already exists, so no Stat is needed.
	if err := os.MkdirAll(docDir, 0755); err != nil {
		return nil, fmt.Errorf("creating cache directory %q: %w", docDir, err)
	}

	cached, err := cachedPageImages(docDir)
	if err != nil {
		return nil, err
	}
	if len(cached) > 0 {
		return cached, nil
	}

	pdfPath, err := c.downloadDocumentToTempFile(ctx, document)
	if err != nil {
		return nil, err
	}
	defer os.Remove(pdfPath)

	doc, err := fitz.New(pdfPath)
	if err != nil {
		return nil, fmt.Errorf("opening PDF of document %d: %w", document.ID, err)
	}
	defer doc.Close()

	var imagePaths []string
	numPages := doc.NumPage()
	for n := 0; n < numPages; n++ {
		imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))

		err := ctx.Err()
		if err == nil {
			err = renderPageAsJPEG(doc, n, imagePath)
		}
		if err != nil {
			// Drop everything written so far, including the current (possibly
			// truncated or invalid) page, so the cache never holds a partial set.
			removeFiles(append(imagePaths, imagePath))
			return nil, err
		}
		imagePaths = append(imagePaths, imagePath)
	}

	return imagePaths, nil
}

// cachedPageImages returns the consecutive page images (page000.jpg, ...) that
// already exist in docDir, stopping at the first missing page.
func cachedPageImages(docDir string) ([]string, error) {
	var imagePaths []string
	for n := 0; ; n++ {
		imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
		_, err := os.Stat(imagePath)
		if os.IsNotExist(err) {
			return imagePaths, nil
		}
		if err != nil {
			// Any other Stat failure (e.g. permission denied) would repeat for
			// every page number, so report it instead of looping forever.
			return nil, fmt.Errorf("checking cached image %q: %w", imagePath, err)
		}
		imagePaths = append(imagePaths, imagePath)
	}
}

// downloadDocumentToTempFile streams the document's PDF into a temporary file
// and returns its path. The caller is responsible for removing the file.
func (c *PaperlessClient) downloadDocumentToTempFile(ctx context.Context, document Document) (string, error) {
	path := fmt.Sprintf("api/documents/%d/download/", document.ID)
	resp, err := c.Do(ctx, "GET", path, nil)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message.
		bodyBytes, _ := io.ReadAll(resp.Body)
		return "", fmt.Errorf("error downloading document %d: %d, %s", document.ID, resp.StatusCode, string(bodyBytes))
	}

	tmpFile, err := os.CreateTemp("", "document-*.pdf")
	if err != nil {
		return "", err
	}

	// Stream to disk instead of buffering the whole PDF in memory.
	_, err = io.Copy(tmpFile, resp.Body)
	if closeErr := tmpFile.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		_ = os.Remove(tmpFile.Name()) // best effort cleanup of the unusable file
		return "", fmt.Errorf("saving PDF of document %d: %w", document.ID, err)
	}

	return tmpFile.Name(), nil
}

// renderPageAsJPEG renders page n of doc to imagePath and then decodes the
// written file again to verify that it is a readable JPEG.
func renderPageAsJPEG(doc *fitz.Document, n int, imagePath string) error {
	img, err := doc.Image(n)
	if err != nil {
		return fmt.Errorf("rendering page %d: %w", n, err)
	}

	f, err := os.Create(imagePath)
	if err != nil {
		return err
	}
	err = jpeg.Encode(f, img, &jpeg.Options{Quality: jpeg.DefaultQuality})
	// Close unconditionally; a failed Close can hide a failed write.
	if closeErr := f.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		return fmt.Errorf("writing %q: %w", imagePath, err)
	}

	file, err := os.Open(imagePath)
	if err != nil {
		return err
	}
	defer file.Close()

	if _, err := jpeg.Decode(file); err != nil {
		return fmt.Errorf("invalid JPEG file: %s: %w", imagePath, err)
	}
	return nil
}

// removeFiles deletes the given files, ignoring errors (best-effort cleanup).
func removeFiles(paths []string) {
	for _, p := range paths {
		_ = os.Remove(p)
	}
}
// GetCacheFolder reports the directory in which the client caches files.
// When CacheFolder is empty it is first set to "paperless-gpt" inside the
// operating system's temp directory, and that value is kept on the client.
func (c *PaperlessClient) GetCacheFolder() string {
	if c.CacheFolder != "" {
		return c.CacheFolder
	}
	c.CacheFolder = filepath.Join(os.TempDir(), "paperless-gpt")
	return c.CacheFolder
}
// urlEncode encodes a string for safe URL usage
func urlEncode(s string) string {
return strings.ReplaceAll(s, " ", "+")

412
paperless_test.go Normal file
View file

@ -0,0 +1,412 @@
package main
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/http/httptest"
"os"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// testEnv holds the state shared by the PaperlessClient tests: a mock HTTP
// server, a client pointed at that server, and the handlers the server
// dispatches to.
type testEnv struct {
// t is the test that created this environment.
t *testing.T
// server is the mock HTTP server started by newTestEnv.
server *httptest.Server
// client is the PaperlessClient configured to talk to server.
client *PaperlessClient
// requestCount is incremented once for every request the mock server receives.
requestCount int
// mockResponses maps a request URL path to the handler that serves it.
mockResponses map[string]http.HandlerFunc
}
// newTestEnv starts a mock Paperless server and returns a testEnv whose client
// talks to it using the API token "test-token".
//
// Incoming requests are dispatched by URL path to the handlers registered via
// setMockResponse. A request for an unregistered path marks the test as failed
// and is answered with 404. Every dispatched request is also checked for the
// expected Authorization header. Call teardown when the test is done.
func newTestEnv(t *testing.T) *testEnv {
	t.Helper()

	env := &testEnv{
		t:             t,
		mockResponses: make(map[string]http.HandlerFunc),
	}

	env.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		env.requestCount++

		handler, exists := env.mockResponses[r.URL.Path]
		if !exists {
			// This runs on the server's goroutine, where t.Fatalf (FailNow) must
			// not be called. Record the failure and give the client a proper
			// response so the test goroutine can carry on and finish.
			t.Errorf("Unexpected request URL: %s", r.URL.Path)
			http.NotFound(w, r)
			return
		}

		assert.Equal(t, "Token test-token", r.Header.Get("Authorization"))
		handler(w, r)
	}))

	env.client = NewPaperlessClient(env.server.URL, "test-token")
	env.client.HTTPClient = env.server.Client()

	return env
}
// teardown shuts down the mock server that backs this test environment.
func (e *testEnv) teardown() {
	e.server.Close()
}
// setMockResponse registers handler as the mock server's response for requests
// whose URL path equals path, replacing any handler registered for it before.
func (e *testEnv) setMockResponse(path string, handler http.HandlerFunc) {
	e.mockResponses[path] = handler
}
// TestNewPaperlessClient verifies that NewPaperlessClient stores the base URL
// and API token it is given and provides a non-nil HTTP client.
func TestNewPaperlessClient(t *testing.T) {
	const (
		baseURL  = "http://example.com"
		apiToken = "test-token"
	)

	client := NewPaperlessClient(baseURL, apiToken)

	assert.Equal(t, "http://example.com", client.BaseURL)
	assert.Equal(t, apiToken, client.APIToken)
	assert.NotNil(t, client.HTTPClient)
}
// TestDo verifies that Do issues a GET request against the mock server and
// hands back the server's status code and body unchanged.
func TestDo(t *testing.T) {
	env := newTestEnv(t)
	defer env.teardown()

	const payload = `{"message": "success"}`

	// The handler checks the HTTP method and replies with a fixed JSON body.
	env.setMockResponse("/test-path", func(w http.ResponseWriter, r *http.Request) {
		assert.Equal(t, "GET", r.Method)
		w.WriteHeader(http.StatusOK)
		w.Write([]byte(payload))
	})

	resp, err := env.client.Do(context.Background(), "GET", "/test-path", nil)
	require.NoError(t, err)
	defer resp.Body.Close()

	got, err := io.ReadAll(resp.Body)
	require.NoError(t, err)

	assert.Equal(t, http.StatusOK, resp.StatusCode)
	assert.Equal(t, payload, string(got))
}
// TestGetAllTags verifies that GetAllTags follows the "next" link of a
// paginated tag listing and returns the tags of all pages as a name-to-ID map.
func TestGetAllTags(t *testing.T) {
	env := newTestEnv(t)
	defer env.teardown()

	// First page: two tags plus a link to the second page.
	firstPage := map[string]any{
		"results": []map[string]any{
			{"id": 1, "name": "tag1"},
			{"id": 2, "name": "tag2"},
		},
		"next": fmt.Sprintf("%s/api/tags/?page=2", env.server.URL),
	}
	// Second (last) page: one tag and no further link.
	secondPage := map[string]any{
		"results": []map[string]any{
			{"id": 3, "name": "tag3"},
		},
		"next": nil,
	}

	// Serve the second page for ?page=2 and the first page for anything else.
	env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
		page := firstPage
		if r.URL.Query().Get("page") == "2" {
			page = secondPage
		}
		w.WriteHeader(http.StatusOK)
		json.NewEncoder(w).Encode(page)
	})

	tags, err := env.client.GetAllTags(context.Background())
	require.NoError(t, err)

	assert.Equal(t, map[string]int{
		"tag1": 1,
		"tag2": 2,
		"tag3": 3,
	}, tags)
}
// TestGetDocumentsByTags verifies that GetDocumentsByTags sends the expected
// tag query to the documents endpoint and returns documents whose numeric tag
// IDs have been replaced by the matching tag names.
func TestGetDocumentsByTags(t *testing.T) {
	env := newTestEnv(t)
	defer env.teardown()

	// Tag listing served by the mock server (single page).
	tagListing := map[string]any{
		"results": []map[string]any{
			{"id": 1, "name": "tag1"},
			{"id": 2, "name": "tag2"},
			{"id": 3, "name": "tag3"},
		},
		"next": nil,
	}

	// Document listing served by the mock server. The element type has to be
	// spelled out because the Results field uses an anonymous struct type.
	var docListing GetDocumentsApiResponse
	docListing.Results = []struct {
		ID                  int       `json:"id"`
		Correspondent       any       `json:"correspondent"`
		DocumentType        any       `json:"document_type"`
		StoragePath         any       `json:"storage_path"`
		Title               string    `json:"title"`
		Content             string    `json:"content"`
		Tags                []int     `json:"tags"`
		Created             time.Time `json:"created"`
		CreatedDate         string    `json:"created_date"`
		Modified            time.Time `json:"modified"`
		Added               time.Time `json:"added"`
		ArchiveSerialNumber any       `json:"archive_serial_number"`
		OriginalFileName    string    `json:"original_file_name"`
		ArchivedFileName    string    `json:"archived_file_name"`
		Owner               int       `json:"owner"`
		UserCanChange       bool      `json:"user_can_change"`
		Notes               []any     `json:"notes"`
		SearchHit           struct {
			Score          float64 `json:"score"`
			Highlights     string  `json:"highlights"`
			NoteHighlights string  `json:"note_highlights"`
			Rank           int     `json:"rank"`
		} `json:"__search_hit__"`
	}{
		{ID: 1, Title: "Document 1", Content: "Content 1", Tags: []int{1, 2}},
		{ID: 2, Title: "Document 2", Content: "Content 2", Tags: []int{2, 3}},
	}

	env.setMockResponse("/api/documents/", func(w http.ResponseWriter, r *http.Request) {
		// The requested tags must arrive as a single "+"-joined query.
		assert.Equal(t, "query=tag:tag1+tag:tag2", r.URL.RawQuery)
		w.WriteHeader(http.StatusOK)
		json.NewEncoder(w).Encode(docListing)
	})
	env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		json.NewEncoder(w).Encode(tagListing)
	})

	got, err := env.client.GetDocumentsByTags(context.Background(), []string{"tag1", "tag2"})
	require.NoError(t, err)

	want := []Document{
		{ID: 1, Title: "Document 1", Content: "Content 1", Tags: []string{"tag1", "tag2"}},
		{ID: 2, Title: "Document 2", Content: "Content 2", Tags: []string{"tag2", "tag3"}},
	}
	assert.Equal(t, want, got)
}
// TestDownloadPDF verifies that DownloadPDF returns exactly the bytes served by
// the document download endpoint.
func TestDownloadPDF(t *testing.T) {
	env := newTestEnv(t)
	defer env.teardown()

	doc := Document{ID: 123}

	// The fixture doubles as the mock server's response and the expected result.
	want, err := os.ReadFile("tests/pdf/sample.pdf")
	require.NoError(t, err)

	env.setMockResponse(
		fmt.Sprintf("/api/documents/%d/download/", doc.ID),
		func(w http.ResponseWriter, _ *http.Request) {
			w.WriteHeader(http.StatusOK)
			w.Write(want)
		},
	)

	got, err := env.client.DownloadPDF(context.Background(), doc)
	require.NoError(t, err)
	assert.Equal(t, want, got)
}
// TestUpdateDocuments verifies that UpdateDocuments sends a PATCH request for
// the suggested document whose body carries the suggested title and the IDs of
// the suggested tags.
func TestUpdateDocuments(t *testing.T) {
	env := newTestEnv(t)
	defer env.teardown()

	// Document suggestion to apply.
	documents := []DocumentSuggestion{
		{
			ID: 1,
			OriginalDocument: Document{
				ID:    1,
				Title: "Old Title",
				Tags:  []string{"tag1"},
			},
			SuggestedTitle: "New Title",
			SuggestedTags:  []string{"tag2"},
		},
	}

	// Tag listing served by the mock server.
	tagsResponse := map[string]any{
		"results": []map[string]any{
			{"id": 1, "name": "tag1"},
			{"id": 2, "name": "tag2"},
			{"id": 3, "name": "manual"},
		},
		"next": nil,
	}

	// manualTag is package-level state; restore it afterwards so this test
	// cannot influence tests that run later.
	previousManualTag := manualTag
	manualTag = "manual"
	t.Cleanup(func() { manualTag = previousManualTag })

	env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		json.NewEncoder(w).Encode(tagsResponse)
	})

	updatePath := fmt.Sprintf("/api/documents/%d/", documents[0].ID)
	env.setMockResponse(updatePath, func(w http.ResponseWriter, r *http.Request) {
		// Handlers run on the server's goroutine, so only assert.* (which
		// records a failure) is used here; require.* would call FailNow from
		// the wrong goroutine.
		assert.Equal(t, "PATCH", r.Method)

		bodyBytes, err := io.ReadAll(r.Body)
		if !assert.NoError(t, err) {
			w.WriteHeader(http.StatusInternalServerError)
			return
		}

		var updatedFields map[string]any
		if !assert.NoError(t, json.Unmarshal(bodyBytes, &updatedFields)) {
			w.WriteHeader(http.StatusBadRequest)
			return
		}

		// JSON numbers decode to float64, hence float64(2) for the ID of tag2.
		expectedFields := map[string]any{
			"title": "New Title",
			"tags":  []any{float64(2)},
		}
		assert.Equal(t, expectedFields, updatedFields)

		w.WriteHeader(http.StatusOK)
	})

	err := env.client.UpdateDocuments(context.Background(), documents)
	require.NoError(t, err)
}
// TestUrlEncode verifies that urlEncode turns the spaces of a query string
// into "+" characters.
func TestUrlEncode(t *testing.T) {
	got := urlEncode("tag:tag1 tag:tag2")
	assert.Equal(t, "tag:tag1+tag:tag2", got)
}
// TestDownloadDocumentAsImages verifies that a single-page PDF is downloaded
// and converted into exactly one JPEG named page000.jpg inside the document's
// cache directory.
func TestDownloadDocumentAsImages(t *testing.T) {
	env := newTestEnv(t)
	defer env.teardown()

	// Use a fresh cache directory per run. With the shared default cache,
	// images left over from an earlier run would be returned straight away and
	// the download and conversion would never be exercised.
	env.client.CacheFolder = t.TempDir()

	document := Document{
		ID: 123,
	}

	// Single-page sample PDF served by the mock server.
	pdfContent, err := os.ReadFile("tests/pdf/sample.pdf")
	require.NoError(t, err)

	downloadPath := fmt.Sprintf("/api/documents/%d/download/", document.ID)
	env.setMockResponse(downloadPath, func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		w.Write(pdfContent)
	})

	imagePaths, err := env.client.DownloadDocumentAsImages(context.Background(), document)
	require.NoError(t, err)

	// Exactly one page must have been extracted. require stops the test here so
	// the index expression below cannot panic on an empty result.
	require.Len(t, imagePaths, 1)

	// The path must end with document-123/page000.jpg, using the OS separator.
	expectedSuffix := fmt.Sprintf("document-%d%cpage000.jpg", document.ID, os.PathSeparator)
	assert.Contains(t, imagePaths[0], expectedSuffix)

	for _, imagePath := range imagePaths {
		_, err := os.Stat(imagePath)
		assert.NoError(t, err)
	}
}
// TestDownloadDocumentAsImages_ManyPages verifies that every page of a
// multi-page PDF is converted into its own image inside a custom cache folder.
func TestDownloadDocumentAsImages_ManyPages(t *testing.T) {
	env := newTestEnv(t)
	defer env.teardown()

	document := Document{
		ID: 321,
	}

	// Multi-page sample PDF served by the mock server.
	pdfContent, err := os.ReadFile("tests/pdf/many-pages.pdf")
	require.NoError(t, err)

	downloadPath := fmt.Sprintf("/api/documents/%d/download/", document.ID)
	env.setMockResponse(downloadPath, func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		w.Write(pdfContent)
	})

	env.client.CacheFolder = "tests/tmp"
	// Start from an empty cache. If stale images survived, they would be
	// returned directly and the conversion would not be tested, so a failed
	// cleanup must abort the test.
	require.NoError(t, os.RemoveAll(env.client.CacheFolder))

	imagePaths, err := env.client.DownloadDocumentAsImages(context.Background(), document)
	require.NoError(t, err)

	// Exactly 52 pages must have been extracted.
	assert.Len(t, imagePaths, 52)

	// Every image must exist and live under tests/tmp/document-321/.
	for _, imagePath := range imagePaths {
		_, err := os.Stat(imagePath)
		assert.NoError(t, err)
		assert.Contains(t, imagePath, "tests/tmp/document-321/page")
	}
}

BIN
tests/pdf/many-pages.pdf Normal file

Binary file not shown.

BIN
tests/pdf/sample.pdf Normal file

Binary file not shown.