mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-13 13:18:02 -05:00
feat: Download PDFs as images
This commit is contained in:
parent
10df151525
commit
8c7bd7273f
7 changed files with 541 additions and 4 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,3 +1,4 @@
|
|||
.env
|
||||
.DS_Store
|
||||
prompts/
|
||||
tests/tmp
|
6
go.mod
6
go.mod
|
@ -6,7 +6,9 @@ toolchain go1.22.2
|
|||
|
||||
require (
|
||||
github.com/Masterminds/sprig/v3 v3.2.3
|
||||
github.com/gen2brain/go-fitz v1.24.14
|
||||
github.com/gin-gonic/gin v1.10.0
|
||||
github.com/stretchr/testify v1.9.0
|
||||
github.com/tmc/langchaingo v0.1.12
|
||||
)
|
||||
|
||||
|
@ -17,7 +19,9 @@ require (
|
|||
github.com/bytedance/sonic/loader v0.1.1 // indirect
|
||||
github.com/cloudwego/base64x v0.1.4 // indirect
|
||||
github.com/cloudwego/iasm v0.2.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dlclark/regexp2 v1.10.0 // indirect
|
||||
github.com/ebitengine/purego v0.8.0 // indirect
|
||||
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
|
||||
github.com/gin-contrib/sse v0.1.0 // indirect
|
||||
github.com/go-playground/locales v0.14.1 // indirect
|
||||
|
@ -28,6 +32,7 @@ require (
|
|||
github.com/huandu/xstrings v1.3.3 // indirect
|
||||
github.com/imdario/mergo v0.3.13 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/jupiterrider/ffi v0.2.0 // indirect
|
||||
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
|
||||
github.com/leodido/go-urn v1.4.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
|
@ -37,6 +42,7 @@ require (
|
|||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
|
||||
github.com/pkoukk/tiktoken-go v0.1.6 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/shopspring/decimal v1.2.0 // indirect
|
||||
github.com/spf13/cast v1.3.1 // indirect
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
|
||||
|
|
6
go.sum
6
go.sum
|
@ -17,8 +17,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
|
|||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0=
|
||||
github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
|
||||
github.com/ebitengine/purego v0.8.0 h1:JbqvnEzRvPpxhCJzJJ2y0RbiZ8nyjccVUrSM3q+GvvE=
|
||||
github.com/ebitengine/purego v0.8.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
|
||||
github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
|
||||
github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
|
||||
github.com/gen2brain/go-fitz v1.24.14 h1:09weRkjVtLYNGo7l0J7DyOwBExbwi8SJ9h8YPhw9WEo=
|
||||
github.com/gen2brain/go-fitz v1.24.14/go.mod h1:0KaZeQgASc20Yp5R/pFzyy7SmP01XcoHKNF842U2/S4=
|
||||
github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
|
||||
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
|
||||
github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU=
|
||||
|
@ -46,6 +50,8 @@ github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk=
|
|||
github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg=
|
||||
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
|
||||
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
|
||||
github.com/jupiterrider/ffi v0.2.0 h1:tMM70PexgYNmV+WyaYhJgCvQAvtTCs3wXeILPutihnA=
|
||||
github.com/jupiterrider/ffi v0.2.0/go.mod h1:yqYqX5DdEccAsHeMn+6owkoI2llBLySVAF8dwCDZPVs=
|
||||
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
|
||||
github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
|
||||
github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
|
||||
|
|
118
paperless.go
118
paperless.go
|
@ -5,17 +5,23 @@ import (
|
|||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"image/jpeg"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/gen2brain/go-fitz"
|
||||
)
|
||||
|
||||
// PaperlessClient holds the settings needed to interact with the
// Paperless-NGX API.
type PaperlessClient struct {
	// BaseURL is the root URL of the Paperless-NGX instance.
	BaseURL string
	// APIToken is the token used to authorize API requests.
	APIToken string
	// HTTPClient is the HTTP client used to perform the requests.
	HTTPClient *http.Client
	// CacheFolder is the directory under which rendered page images are
	// stored. When empty, GetCacheFolder falls back to a "paperless-gpt"
	// directory inside the system temp dir.
	CacheFolder string
}
|
||||
|
||||
// NewPaperlessClient creates a new instance of PaperlessClient with a default HTTP client
|
||||
|
@ -239,6 +245,112 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum
|
|||
return nil
|
||||
}
|
||||
|
||||
// DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images
|
||||
func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document Document) ([]string, error) {
|
||||
// Create a directory named after the document ID
|
||||
docDir := filepath.Join(c.GetCacheFolder(), fmt.Sprintf("/document-%d", document.ID))
|
||||
if _, err := os.Stat(docDir); os.IsNotExist(err) {
|
||||
err = os.MkdirAll(docDir, 0755)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// Check if images already exist
|
||||
var imagePaths []string
|
||||
for n := 0; ; n++ {
|
||||
imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
|
||||
if _, err := os.Stat(imagePath); os.IsNotExist(err) {
|
||||
break
|
||||
}
|
||||
imagePaths = append(imagePaths, imagePath)
|
||||
}
|
||||
|
||||
// If images exist, return them
|
||||
if len(imagePaths) > 0 {
|
||||
return imagePaths, nil
|
||||
}
|
||||
|
||||
// Proceed with downloading and converting the document to images
|
||||
path := fmt.Sprintf("api/documents/%d/download/", document.ID)
|
||||
resp, err := c.Do(ctx, "GET", path, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
bodyBytes, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("error downloading document %d: %d, %s", document.ID, resp.StatusCode, string(bodyBytes))
|
||||
}
|
||||
|
||||
pdfData, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tmpFile, err := os.CreateTemp("", "document-*.pdf")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer os.Remove(tmpFile.Name())
|
||||
|
||||
_, err = tmpFile.Write(pdfData)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tmpFile.Close()
|
||||
|
||||
doc, err := fitz.New(tmpFile.Name())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer doc.Close()
|
||||
|
||||
for n := 0; n < doc.NumPage(); n++ {
|
||||
img, err := doc.Image(n)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
|
||||
f, err := os.Create(imagePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
err = jpeg.Encode(f, img, &jpeg.Options{Quality: jpeg.DefaultQuality})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
f.Close()
|
||||
|
||||
// Verify the JPEG file
|
||||
file, err := os.Open(imagePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
_, err = jpeg.Decode(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid JPEG file: %s", imagePath)
|
||||
}
|
||||
|
||||
imagePaths = append(imagePaths, imagePath)
|
||||
}
|
||||
|
||||
return imagePaths, nil
|
||||
}
|
||||
|
||||
// GetCacheFolder returns the cache folder for the PaperlessClient
|
||||
func (c *PaperlessClient) GetCacheFolder() string {
|
||||
if c.CacheFolder == "" {
|
||||
c.CacheFolder = filepath.Join(os.TempDir(), "paperless-gpt")
|
||||
}
|
||||
return c.CacheFolder
|
||||
}
|
||||
|
||||
// urlEncode encodes a string for safe URL usage
|
||||
func urlEncode(s string) string {
|
||||
return strings.ReplaceAll(s, " ", "+")
|
||||
|
|
412
paperless_test.go
Normal file
412
paperless_test.go
Normal file
|
@ -0,0 +1,412 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// Helper struct to hold common test data and methods
|
||||
type testEnv struct {
|
||||
t *testing.T
|
||||
server *httptest.Server
|
||||
client *PaperlessClient
|
||||
requestCount int
|
||||
mockResponses map[string]http.HandlerFunc
|
||||
}
|
||||
|
||||
// newTestEnv initializes a new test environment
|
||||
func newTestEnv(t *testing.T) *testEnv {
|
||||
env := &testEnv{
|
||||
t: t,
|
||||
mockResponses: make(map[string]http.HandlerFunc),
|
||||
}
|
||||
|
||||
// Create a mock server with a handler that dispatches based on URL path
|
||||
env.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
env.requestCount++
|
||||
handler, exists := env.mockResponses[r.URL.Path]
|
||||
if !exists {
|
||||
t.Fatalf("Unexpected request URL: %s", r.URL.Path)
|
||||
}
|
||||
// Set common headers and invoke the handler
|
||||
assert.Equal(t, "Token test-token", r.Header.Get("Authorization"))
|
||||
handler(w, r)
|
||||
}))
|
||||
|
||||
// Initialize the PaperlessClient with the mock server URL
|
||||
env.client = NewPaperlessClient(env.server.URL, "test-token")
|
||||
env.client.HTTPClient = env.server.Client()
|
||||
|
||||
return env
|
||||
}
|
||||
|
||||
// teardown closes the mock server
|
||||
func (env *testEnv) teardown() {
|
||||
env.server.Close()
|
||||
}
|
||||
|
||||
// Helper method to set a mock response for a specific path
|
||||
func (env *testEnv) setMockResponse(path string, handler http.HandlerFunc) {
|
||||
env.mockResponses[path] = handler
|
||||
}
|
||||
|
||||
// TestNewPaperlessClient tests the creation of a new PaperlessClient instance
|
||||
func TestNewPaperlessClient(t *testing.T) {
|
||||
baseURL := "http://example.com"
|
||||
apiToken := "test-token"
|
||||
|
||||
client := NewPaperlessClient(baseURL, apiToken)
|
||||
|
||||
assert.Equal(t, "http://example.com", client.BaseURL)
|
||||
assert.Equal(t, apiToken, client.APIToken)
|
||||
assert.NotNil(t, client.HTTPClient)
|
||||
}
|
||||
|
||||
// TestDo tests the Do method of PaperlessClient
|
||||
func TestDo(t *testing.T) {
|
||||
env := newTestEnv(t)
|
||||
defer env.teardown()
|
||||
|
||||
// Set mock response for "/test-path"
|
||||
env.setMockResponse("/test-path", func(w http.ResponseWriter, r *http.Request) {
|
||||
// Verify the request method
|
||||
assert.Equal(t, "GET", r.Method)
|
||||
// Send a mock response
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte(`{"message": "success"}`))
|
||||
})
|
||||
|
||||
ctx := context.Background()
|
||||
resp, err := env.client.Do(ctx, "GET", "/test-path", nil)
|
||||
require.NoError(t, err)
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, http.StatusOK, resp.StatusCode)
|
||||
assert.Equal(t, `{"message": "success"}`, string(body))
|
||||
}
|
||||
|
||||
// TestGetAllTags tests the GetAllTags method, including pagination
|
||||
func TestGetAllTags(t *testing.T) {
|
||||
env := newTestEnv(t)
|
||||
defer env.teardown()
|
||||
|
||||
// Mock data for paginated responses
|
||||
page1 := map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{"id": 1, "name": "tag1"},
|
||||
{"id": 2, "name": "tag2"},
|
||||
},
|
||||
"next": fmt.Sprintf("%s/api/tags/?page=2", env.server.URL),
|
||||
}
|
||||
page2 := map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{"id": 3, "name": "tag3"},
|
||||
},
|
||||
"next": nil,
|
||||
}
|
||||
|
||||
// Set mock responses for pagination
|
||||
env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
|
||||
query := r.URL.Query().Get("page")
|
||||
if query == "2" {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(w).Encode(page2)
|
||||
} else {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(w).Encode(page1)
|
||||
}
|
||||
})
|
||||
|
||||
ctx := context.Background()
|
||||
tags, err := env.client.GetAllTags(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
expectedTags := map[string]int{
|
||||
"tag1": 1,
|
||||
"tag2": 2,
|
||||
"tag3": 3,
|
||||
}
|
||||
|
||||
assert.Equal(t, expectedTags, tags)
|
||||
}
|
||||
|
||||
// TestGetDocumentsByTags tests the GetDocumentsByTags method
|
||||
func TestGetDocumentsByTags(t *testing.T) {
|
||||
env := newTestEnv(t)
|
||||
defer env.teardown()
|
||||
|
||||
// Mock data for documents
|
||||
documentsResponse := GetDocumentsApiResponse{
|
||||
Results: []struct {
|
||||
ID int `json:"id"`
|
||||
Correspondent interface{} `json:"correspondent"`
|
||||
DocumentType interface{} `json:"document_type"`
|
||||
StoragePath interface{} `json:"storage_path"`
|
||||
Title string `json:"title"`
|
||||
Content string `json:"content"`
|
||||
Tags []int `json:"tags"`
|
||||
Created time.Time `json:"created"`
|
||||
CreatedDate string `json:"created_date"`
|
||||
Modified time.Time `json:"modified"`
|
||||
Added time.Time `json:"added"`
|
||||
ArchiveSerialNumber interface{} `json:"archive_serial_number"`
|
||||
OriginalFileName string `json:"original_file_name"`
|
||||
ArchivedFileName string `json:"archived_file_name"`
|
||||
Owner int `json:"owner"`
|
||||
UserCanChange bool `json:"user_can_change"`
|
||||
Notes []interface{} `json:"notes"`
|
||||
SearchHit struct {
|
||||
Score float64 `json:"score"`
|
||||
Highlights string `json:"highlights"`
|
||||
NoteHighlights string `json:"note_highlights"`
|
||||
Rank int `json:"rank"`
|
||||
} `json:"__search_hit__"`
|
||||
}{
|
||||
{
|
||||
ID: 1,
|
||||
Title: "Document 1",
|
||||
Content: "Content 1",
|
||||
Tags: []int{1, 2},
|
||||
},
|
||||
{
|
||||
ID: 2,
|
||||
Title: "Document 2",
|
||||
Content: "Content 2",
|
||||
Tags: []int{2, 3},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// Mock data for tags
|
||||
tagsResponse := map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{"id": 1, "name": "tag1"},
|
||||
{"id": 2, "name": "tag2"},
|
||||
{"id": 3, "name": "tag3"},
|
||||
},
|
||||
"next": nil,
|
||||
}
|
||||
|
||||
// Set mock responses
|
||||
env.setMockResponse("/api/documents/", func(w http.ResponseWriter, r *http.Request) {
|
||||
// Verify query parameters
|
||||
expectedQuery := "query=tag:tag1+tag:tag2"
|
||||
assert.Equal(t, expectedQuery, r.URL.RawQuery)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(w).Encode(documentsResponse)
|
||||
})
|
||||
|
||||
env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(w).Encode(tagsResponse)
|
||||
})
|
||||
|
||||
ctx := context.Background()
|
||||
tags := []string{"tag1", "tag2"}
|
||||
documents, err := env.client.GetDocumentsByTags(ctx, tags)
|
||||
require.NoError(t, err)
|
||||
|
||||
expectedDocuments := []Document{
|
||||
{
|
||||
ID: 1,
|
||||
Title: "Document 1",
|
||||
Content: "Content 1",
|
||||
Tags: []string{"tag1", "tag2"},
|
||||
},
|
||||
{
|
||||
ID: 2,
|
||||
Title: "Document 2",
|
||||
Content: "Content 2",
|
||||
Tags: []string{"tag2", "tag3"},
|
||||
},
|
||||
}
|
||||
|
||||
assert.Equal(t, expectedDocuments, documents)
|
||||
}
|
||||
|
||||
// TestDownloadPDF tests the DownloadPDF method
|
||||
func TestDownloadPDF(t *testing.T) {
|
||||
env := newTestEnv(t)
|
||||
defer env.teardown()
|
||||
|
||||
document := Document{
|
||||
ID: 123,
|
||||
}
|
||||
|
||||
// Get sample PDF from tests/pdf/sample.pdf
|
||||
pdfFile := "tests/pdf/sample.pdf"
|
||||
pdfContent, err := os.ReadFile(pdfFile)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Set mock response
|
||||
downloadPath := fmt.Sprintf("/api/documents/%d/download/", document.ID)
|
||||
env.setMockResponse(downloadPath, func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write(pdfContent)
|
||||
})
|
||||
|
||||
ctx := context.Background()
|
||||
data, err := env.client.DownloadPDF(ctx, document)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, pdfContent, data)
|
||||
}
|
||||
|
||||
// TestUpdateDocuments tests the UpdateDocuments method
|
||||
func TestUpdateDocuments(t *testing.T) {
|
||||
env := newTestEnv(t)
|
||||
defer env.teardown()
|
||||
|
||||
// Mock data for documents to update
|
||||
documents := []DocumentSuggestion{
|
||||
{
|
||||
ID: 1,
|
||||
OriginalDocument: Document{
|
||||
ID: 1,
|
||||
Title: "Old Title",
|
||||
Tags: []string{"tag1"},
|
||||
},
|
||||
SuggestedTitle: "New Title",
|
||||
SuggestedTags: []string{"tag2"},
|
||||
},
|
||||
}
|
||||
// Mock data for tags
|
||||
tagsResponse := map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{"id": 1, "name": "tag1"},
|
||||
{"id": 2, "name": "tag2"},
|
||||
{"id": 3, "name": "manual"},
|
||||
},
|
||||
"next": nil,
|
||||
}
|
||||
|
||||
// Set the manual tag
|
||||
manualTag = "manual"
|
||||
|
||||
// Set mock responses
|
||||
env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(w).Encode(tagsResponse)
|
||||
})
|
||||
|
||||
updatePath := fmt.Sprintf("/api/documents/%d/", documents[0].ID)
|
||||
env.setMockResponse(updatePath, func(w http.ResponseWriter, r *http.Request) {
|
||||
// Verify the request method
|
||||
assert.Equal(t, "PATCH", r.Method)
|
||||
|
||||
// Read and parse the request body
|
||||
bodyBytes, err := io.ReadAll(r.Body)
|
||||
require.NoError(t, err)
|
||||
defer r.Body.Close()
|
||||
|
||||
var updatedFields map[string]interface{}
|
||||
err = json.Unmarshal(bodyBytes, &updatedFields)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Expected updated fields
|
||||
expectedFields := map[string]interface{}{
|
||||
"title": "New Title",
|
||||
"tags": []interface{}{float64(2)}, // tag2 ID
|
||||
}
|
||||
|
||||
assert.Equal(t, expectedFields, updatedFields)
|
||||
|
||||
w.WriteHeader(http.StatusOK)
|
||||
})
|
||||
|
||||
ctx := context.Background()
|
||||
err := env.client.UpdateDocuments(ctx, documents)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
// TestUrlEncode tests the urlEncode function
|
||||
func TestUrlEncode(t *testing.T) {
|
||||
input := "tag:tag1 tag:tag2"
|
||||
expected := "tag:tag1+tag:tag2"
|
||||
result := urlEncode(input)
|
||||
assert.Equal(t, expected, result)
|
||||
}
|
||||
|
||||
// TestDownloadDocumentAsImages tests the DownloadDocumentAsImages method
|
||||
func TestDownloadDocumentAsImages(t *testing.T) {
|
||||
env := newTestEnv(t)
|
||||
defer env.teardown()
|
||||
|
||||
document := Document{
|
||||
ID: 123,
|
||||
}
|
||||
|
||||
// Get sample PDF from tests/pdf/sample.pdf
|
||||
pdfFile := "tests/pdf/sample.pdf"
|
||||
pdfContent, err := os.ReadFile(pdfFile)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Set mock response
|
||||
downloadPath := fmt.Sprintf("/api/documents/%d/download/", document.ID)
|
||||
env.setMockResponse(downloadPath, func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write(pdfContent)
|
||||
})
|
||||
|
||||
ctx := context.Background()
|
||||
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify that exatly one page was extracted
|
||||
assert.Len(t, imagePaths, 1)
|
||||
// The path shall end with paperless-gpt/document-123/page000.jpg
|
||||
assert.Contains(t, imagePaths[0], "paperless-gpt/document-123/page000.jpg")
|
||||
for _, imagePath := range imagePaths {
|
||||
_, err := os.Stat(imagePath)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDownloadDocumentAsImages_ManyPages(t *testing.T) {
|
||||
env := newTestEnv(t)
|
||||
defer env.teardown()
|
||||
|
||||
document := Document{
|
||||
ID: 321,
|
||||
}
|
||||
|
||||
// Get sample PDF from tests/pdf/sample.pdf
|
||||
pdfFile := "tests/pdf/many-pages.pdf"
|
||||
pdfContent, err := os.ReadFile(pdfFile)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Set mock response
|
||||
downloadPath := fmt.Sprintf("/api/documents/%d/download/", document.ID)
|
||||
env.setMockResponse(downloadPath, func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write(pdfContent)
|
||||
})
|
||||
|
||||
ctx := context.Background()
|
||||
env.client.CacheFolder = "tests/tmp"
|
||||
// Clean the cache folder
|
||||
os.RemoveAll(env.client.CacheFolder)
|
||||
imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify that exatly 52 pages were extracted
|
||||
assert.Len(t, imagePaths, 52)
|
||||
// The path shall end with tests/tmp/document-321/page000.jpg
|
||||
for _, imagePath := range imagePaths {
|
||||
_, err := os.Stat(imagePath)
|
||||
assert.NoError(t, err)
|
||||
assert.Contains(t, imagePath, "tests/tmp/document-321/page")
|
||||
}
|
||||
}
|
BIN
tests/pdf/many-pages.pdf
Normal file
BIN
tests/pdf/many-pages.pdf
Normal file
Binary file not shown.
BIN
tests/pdf/sample.pdf
Normal file
BIN
tests/pdf/sample.pdf
Normal file
Binary file not shown.
Loading…
Reference in a new issue