diff --git a/.gitignore b/.gitignore index f679422..a3f42a8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .env .DS_Store -prompts/ \ No newline at end of file +prompts/ +tests/tmp \ No newline at end of file diff --git a/go.mod b/go.mod index 698838e..0600ba0 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,9 @@ toolchain go1.22.2 require ( github.com/Masterminds/sprig/v3 v3.2.3 + github.com/gen2brain/go-fitz v1.24.14 github.com/gin-gonic/gin v1.10.0 + github.com/stretchr/testify v1.9.0 github.com/tmc/langchaingo v0.1.12 ) @@ -17,7 +19,9 @@ require ( github.com/bytedance/sonic/loader v0.1.1 // indirect github.com/cloudwego/base64x v0.1.4 // indirect github.com/cloudwego/iasm v0.2.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect github.com/dlclark/regexp2 v1.10.0 // indirect + github.com/ebitengine/purego v0.8.0 // indirect github.com/gabriel-vasile/mimetype v1.4.3 // indirect github.com/gin-contrib/sse v0.1.0 // indirect github.com/go-playground/locales v0.14.1 // indirect @@ -28,6 +32,7 @@ require ( github.com/huandu/xstrings v1.3.3 // indirect github.com/imdario/mergo v0.3.13 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/jupiterrider/ffi v0.2.0 // indirect github.com/klauspost/cpuid/v2 v2.2.7 // indirect github.com/leodido/go-urn v1.4.0 // indirect github.com/mattn/go-isatty v0.0.20 // indirect @@ -37,6 +42,7 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/pelletier/go-toml/v2 v2.2.2 // indirect github.com/pkoukk/tiktoken-go v0.1.6 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect github.com/shopspring/decimal v1.2.0 // indirect github.com/spf13/cast v1.3.1 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect diff --git a/go.sum b/go.sum index 39f6dd5..14b1880 100644 --- a/go.sum +++ b/go.sum @@ -17,8 +17,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0= github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= +github.com/ebitengine/purego v0.8.0 h1:JbqvnEzRvPpxhCJzJJ2y0RbiZ8nyjccVUrSM3q+GvvE= +github.com/ebitengine/purego v0.8.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0= github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk= +github.com/gen2brain/go-fitz v1.24.14 h1:09weRkjVtLYNGo7l0J7DyOwBExbwi8SJ9h8YPhw9WEo= +github.com/gen2brain/go-fitz v1.24.14/go.mod h1:0KaZeQgASc20Yp5R/pFzyy7SmP01XcoHKNF842U2/S4= github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE= github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU= @@ -46,6 +50,8 @@ github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk= github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/jupiterrider/ffi v0.2.0 h1:tMM70PexgYNmV+WyaYhJgCvQAvtTCs3wXeILPutihnA= +github.com/jupiterrider/ffi v0.2.0/go.mod h1:yqYqX5DdEccAsHeMn+6owkoI2llBLySVAF8dwCDZPVs= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM= github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= diff --git a/paperless.go b/paperless.go index ee28b52..dc38109 100644 --- a/paperless.go +++ b/paperless.go @@ -5,17 +5,23 @@ import ( "context" "encoding/json" "fmt" + "image/jpeg" "io" "log" "net/http" + "os" + "path/filepath" "strings" + + "github.com/gen2brain/go-fitz" ) // PaperlessClient struct to interact with the Paperless-NGX API type PaperlessClient struct { - BaseURL string - APIToken string - HTTPClient *http.Client + BaseURL string + APIToken string + HTTPClient *http.Client + CacheFolder string } // NewPaperlessClient creates a new instance of PaperlessClient with a default HTTP client @@ -239,6 +245,112 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum return nil } +// DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images +func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document Document) ([]string, error) { + // Create a directory named after the document ID + docDir := filepath.Join(c.GetCacheFolder(), fmt.Sprintf("/document-%d", document.ID)) + if _, err := os.Stat(docDir); os.IsNotExist(err) { + err = os.MkdirAll(docDir, 0755) + if err != nil { + return nil, err + } + } + + // Check if images already exist + var imagePaths []string + for n := 0; ; n++ { + imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n)) + if _, err := os.Stat(imagePath); os.IsNotExist(err) { + break + } + imagePaths = append(imagePaths, imagePath) + } + + // If images exist, return them + if len(imagePaths) > 0 { + return imagePaths, nil + } + + // Proceed with downloading and converting the document to images + path := fmt.Sprintf("api/documents/%d/download/", document.ID) + resp, err := c.Do(ctx, "GET", path, nil) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + bodyBytes, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("error downloading document %d: %d, %s", document.ID, resp.StatusCode, string(bodyBytes)) + } + + pdfData, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + tmpFile, err := os.CreateTemp("", "document-*.pdf") + if err != nil { + return nil, err + } + defer os.Remove(tmpFile.Name()) + + _, err = tmpFile.Write(pdfData) + if err != nil { + return nil, err + } + tmpFile.Close() + + doc, err := fitz.New(tmpFile.Name()) + if err != nil { + return nil, err + } + defer doc.Close() + + for n := 0; n < doc.NumPage(); n++ { + img, err := doc.Image(n) + if err != nil { + return nil, err + } + + imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n)) + f, err := os.Create(imagePath) + if err != nil { + return nil, err + } + + err = jpeg.Encode(f, img, &jpeg.Options{Quality: jpeg.DefaultQuality}) + if err != nil { + return nil, err + } + f.Close() + + // Verify the JPEG file + file, err := os.Open(imagePath) + if err != nil { + return nil, err + } + defer file.Close() + + _, err = jpeg.Decode(file) + if err != nil { + return nil, fmt.Errorf("invalid JPEG file: %s", imagePath) + } + + imagePaths = append(imagePaths, imagePath) + } + + return imagePaths, nil +} + +// GetCacheFolder returns the cache folder for the PaperlessClient +func (c *PaperlessClient) GetCacheFolder() string { + if c.CacheFolder == "" { + c.CacheFolder = filepath.Join(os.TempDir(), "paperless-gpt") + } + return c.CacheFolder +} + // urlEncode encodes a string for safe URL usage func urlEncode(s string) string { return strings.ReplaceAll(s, " ", "+") diff --git a/paperless_test.go b/paperless_test.go new file mode 100644 index 0000000..52302aa --- /dev/null +++ b/paperless_test.go @@ -0,0 +1,412 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/http/httptest" + "os" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// Helper struct to hold common test data and methods +type testEnv struct { + t *testing.T + server *httptest.Server + client *PaperlessClient + requestCount int + mockResponses map[string]http.HandlerFunc +} + +// newTestEnv initializes a new test environment +func newTestEnv(t *testing.T) *testEnv { + env := &testEnv{ + t: t, + mockResponses: make(map[string]http.HandlerFunc), + } + + // Create a mock server with a handler that dispatches based on URL path + env.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + env.requestCount++ + handler, exists := env.mockResponses[r.URL.Path] + if !exists { + t.Fatalf("Unexpected request URL: %s", r.URL.Path) + } + // Set common headers and invoke the handler + assert.Equal(t, "Token test-token", r.Header.Get("Authorization")) + handler(w, r) + })) + + // Initialize the PaperlessClient with the mock server URL + env.client = NewPaperlessClient(env.server.URL, "test-token") + env.client.HTTPClient = env.server.Client() + + return env +} + +// teardown closes the mock server +func (env *testEnv) teardown() { + env.server.Close() +} + +// Helper method to set a mock response for a specific path +func (env *testEnv) setMockResponse(path string, handler http.HandlerFunc) { + env.mockResponses[path] = handler +} + +// TestNewPaperlessClient tests the creation of a new PaperlessClient instance +func TestNewPaperlessClient(t *testing.T) { + baseURL := "http://example.com" + apiToken := "test-token" + + client := NewPaperlessClient(baseURL, apiToken) + + assert.Equal(t, "http://example.com", client.BaseURL) + assert.Equal(t, apiToken, client.APIToken) + assert.NotNil(t, client.HTTPClient) +} + +// TestDo tests the Do method of PaperlessClient +func TestDo(t *testing.T) { + env := newTestEnv(t) + defer env.teardown() + + // Set mock response for "/test-path" + env.setMockResponse("/test-path", func(w http.ResponseWriter, r *http.Request) { + // Verify the request method + assert.Equal(t, "GET", r.Method) + // Send a mock response + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"message": "success"}`)) + }) + + ctx := context.Background() + resp, err := env.client.Do(ctx, "GET", "/test-path", nil) + require.NoError(t, err) + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + assert.Equal(t, http.StatusOK, resp.StatusCode) + assert.Equal(t, `{"message": "success"}`, string(body)) +} + +// TestGetAllTags tests the GetAllTags method, including pagination +func TestGetAllTags(t *testing.T) { + env := newTestEnv(t) + defer env.teardown() + + // Mock data for paginated responses + page1 := map[string]interface{}{ + "results": []map[string]interface{}{ + {"id": 1, "name": "tag1"}, + {"id": 2, "name": "tag2"}, + }, + "next": fmt.Sprintf("%s/api/tags/?page=2", env.server.URL), + } + page2 := map[string]interface{}{ + "results": []map[string]interface{}{ + {"id": 3, "name": "tag3"}, + }, + "next": nil, + } + + // Set mock responses for pagination + env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) { + query := r.URL.Query().Get("page") + if query == "2" { + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(page2) + } else { + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(page1) + } + }) + + ctx := context.Background() + tags, err := env.client.GetAllTags(ctx) + require.NoError(t, err) + + expectedTags := map[string]int{ + "tag1": 1, + "tag2": 2, + "tag3": 3, + } + + assert.Equal(t, expectedTags, tags) +} + +// TestGetDocumentsByTags tests the GetDocumentsByTags method +func TestGetDocumentsByTags(t *testing.T) { + env := newTestEnv(t) + defer env.teardown() + + // Mock data for documents + documentsResponse := GetDocumentsApiResponse{ + Results: []struct { + ID int `json:"id"` + Correspondent interface{} `json:"correspondent"` + DocumentType interface{} `json:"document_type"` + StoragePath interface{} `json:"storage_path"` + Title string `json:"title"` + Content string `json:"content"` + Tags []int `json:"tags"` + Created time.Time `json:"created"` + CreatedDate string `json:"created_date"` + Modified time.Time `json:"modified"` + Added time.Time `json:"added"` + ArchiveSerialNumber interface{} `json:"archive_serial_number"` + OriginalFileName string `json:"original_file_name"` + ArchivedFileName string `json:"archived_file_name"` + Owner int `json:"owner"` + UserCanChange bool `json:"user_can_change"` + Notes []interface{} `json:"notes"` + SearchHit struct { + Score float64 `json:"score"` + Highlights string `json:"highlights"` + NoteHighlights string `json:"note_highlights"` + Rank int `json:"rank"` + } `json:"__search_hit__"` + }{ + { + ID: 1, + Title: "Document 1", + Content: "Content 1", + Tags: []int{1, 2}, + }, + { + ID: 2, + Title: "Document 2", + Content: "Content 2", + Tags: []int{2, 3}, + }, + }, + } + + // Mock data for tags + tagsResponse := map[string]interface{}{ + "results": []map[string]interface{}{ + {"id": 1, "name": "tag1"}, + {"id": 2, "name": "tag2"}, + {"id": 3, "name": "tag3"}, + }, + "next": nil, + } + + // Set mock responses + env.setMockResponse("/api/documents/", func(w http.ResponseWriter, r *http.Request) { + // Verify query parameters + expectedQuery := "query=tag:tag1+tag:tag2" + assert.Equal(t, expectedQuery, r.URL.RawQuery) + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(documentsResponse) + }) + + env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(tagsResponse) + }) + + ctx := context.Background() + tags := []string{"tag1", "tag2"} + documents, err := env.client.GetDocumentsByTags(ctx, tags) + require.NoError(t, err) + + expectedDocuments := []Document{ + { + ID: 1, + Title: "Document 1", + Content: "Content 1", + Tags: []string{"tag1", "tag2"}, + }, + { + ID: 2, + Title: "Document 2", + Content: "Content 2", + Tags: []string{"tag2", "tag3"}, + }, + } + + assert.Equal(t, expectedDocuments, documents) +} + +// TestDownloadPDF tests the DownloadPDF method +func TestDownloadPDF(t *testing.T) { + env := newTestEnv(t) + defer env.teardown() + + document := Document{ + ID: 123, + } + + // Get sample PDF from tests/pdf/sample.pdf + pdfFile := "tests/pdf/sample.pdf" + pdfContent, err := os.ReadFile(pdfFile) + require.NoError(t, err) + + // Set mock response + downloadPath := fmt.Sprintf("/api/documents/%d/download/", document.ID) + env.setMockResponse(downloadPath, func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write(pdfContent) + }) + + ctx := context.Background() + data, err := env.client.DownloadPDF(ctx, document) + require.NoError(t, err) + assert.Equal(t, pdfContent, data) +} + +// TestUpdateDocuments tests the UpdateDocuments method +func TestUpdateDocuments(t *testing.T) { + env := newTestEnv(t) + defer env.teardown() + + // Mock data for documents to update + documents := []DocumentSuggestion{ + { + ID: 1, + OriginalDocument: Document{ + ID: 1, + Title: "Old Title", + Tags: []string{"tag1"}, + }, + SuggestedTitle: "New Title", + SuggestedTags: []string{"tag2"}, + }, + } + // Mock data for tags + tagsResponse := map[string]interface{}{ + "results": []map[string]interface{}{ + {"id": 1, "name": "tag1"}, + {"id": 2, "name": "tag2"}, + {"id": 3, "name": "manual"}, + }, + "next": nil, + } + + // Set the manual tag + manualTag = "manual" + + // Set mock responses + env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(tagsResponse) + }) + + updatePath := fmt.Sprintf("/api/documents/%d/", documents[0].ID) + env.setMockResponse(updatePath, func(w http.ResponseWriter, r *http.Request) { + // Verify the request method + assert.Equal(t, "PATCH", r.Method) + + // Read and parse the request body + bodyBytes, err := io.ReadAll(r.Body) + require.NoError(t, err) + defer r.Body.Close() + + var updatedFields map[string]interface{} + err = json.Unmarshal(bodyBytes, &updatedFields) + require.NoError(t, err) + + // Expected updated fields + expectedFields := map[string]interface{}{ + "title": "New Title", + "tags": []interface{}{float64(2)}, // tag2 ID + } + + assert.Equal(t, expectedFields, updatedFields) + + w.WriteHeader(http.StatusOK) + }) + + ctx := context.Background() + err := env.client.UpdateDocuments(ctx, documents) + require.NoError(t, err) +} + +// TestUrlEncode tests the urlEncode function +func TestUrlEncode(t *testing.T) { + input := "tag:tag1 tag:tag2" + expected := "tag:tag1+tag:tag2" + result := urlEncode(input) + assert.Equal(t, expected, result) +} + +// TestDownloadDocumentAsImages tests the DownloadDocumentAsImages method +func TestDownloadDocumentAsImages(t *testing.T) { + env := newTestEnv(t) + defer env.teardown() + + document := Document{ + ID: 123, + } + + // Get sample PDF from tests/pdf/sample.pdf + pdfFile := "tests/pdf/sample.pdf" + pdfContent, err := os.ReadFile(pdfFile) + require.NoError(t, err) + + // Set mock response + downloadPath := fmt.Sprintf("/api/documents/%d/download/", document.ID) + env.setMockResponse(downloadPath, func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write(pdfContent) + }) + + ctx := context.Background() + imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document) + require.NoError(t, err) + + // Verify that exatly one page was extracted + assert.Len(t, imagePaths, 1) + // The path shall end with paperless-gpt/document-123/page000.jpg + assert.Contains(t, imagePaths[0], "paperless-gpt/document-123/page000.jpg") + for _, imagePath := range imagePaths { + _, err := os.Stat(imagePath) + assert.NoError(t, err) + } +} + +func TestDownloadDocumentAsImages_ManyPages(t *testing.T) { + env := newTestEnv(t) + defer env.teardown() + + document := Document{ + ID: 321, + } + + // Get sample PDF from tests/pdf/sample.pdf + pdfFile := "tests/pdf/many-pages.pdf" + pdfContent, err := os.ReadFile(pdfFile) + require.NoError(t, err) + + // Set mock response + downloadPath := fmt.Sprintf("/api/documents/%d/download/", document.ID) + env.setMockResponse(downloadPath, func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write(pdfContent) + }) + + ctx := context.Background() + env.client.CacheFolder = "tests/tmp" + // Clean the cache folder + os.RemoveAll(env.client.CacheFolder) + imagePaths, err := env.client.DownloadDocumentAsImages(ctx, document) + require.NoError(t, err) + + // Verify that exatly 52 pages were extracted + assert.Len(t, imagePaths, 52) + // The path shall end with tests/tmp/document-321/page000.jpg + for _, imagePath := range imagePaths { + _, err := os.Stat(imagePath) + assert.NoError(t, err) + assert.Contains(t, imagePath, "tests/tmp/document-321/page") + } +} diff --git a/tests/pdf/many-pages.pdf b/tests/pdf/many-pages.pdf new file mode 100644 index 0000000..5f3bc06 Binary files /dev/null and b/tests/pdf/many-pages.pdf differ diff --git a/tests/pdf/sample.pdf b/tests/pdf/sample.pdf new file mode 100644 index 0000000..3d71959 Binary files /dev/null and b/tests/pdf/sample.pdf differ