package main import ( "bytes" "context" "crypto/tls" "encoding/json" "fmt" "image/jpeg" "io" "net/http" "net/url" "os" "path/filepath" "slices" "sort" "strings" "sync" "github.com/gen2brain/go-fitz" "github.com/sirupsen/logrus" "golang.org/x/sync/errgroup" "gorm.io/gorm" ) // PaperlessClient struct to interact with the Paperless-NGX API type PaperlessClient struct { BaseURL string APIToken string HTTPClient *http.Client CacheFolder string } func hasSameTags(original, suggested []string) bool { if len(original) != len(suggested) { return false } // Create copies to avoid modifying original slices orig := make([]string, len(original)) sugg := make([]string, len(suggested)) copy(orig, original) copy(sugg, suggested) // Sort both slices sort.Strings(orig) sort.Strings(sugg) // Compare elements for i := range orig { if orig[i] != sugg[i] { return false } } return true } // NewPaperlessClient creates a new instance of PaperlessClient with a default HTTP client func NewPaperlessClient(baseURL, apiToken string) *PaperlessClient { cacheFolder := os.Getenv("PAPERLESS_GPT_CACHE_DIR") // Create a custom HTTP transport with TLS configuration tr := &http.Transport{ TLSClientConfig: &tls.Config{ InsecureSkipVerify: paperlessInsecureSkipVerify, }, } httpClient := &http.Client{Transport: tr} return &PaperlessClient{ BaseURL: strings.TrimRight(baseURL, "/"), APIToken: apiToken, HTTPClient: httpClient, CacheFolder: cacheFolder, } } // Do method to make requests to the Paperless-NGX API func (client *PaperlessClient) Do(ctx context.Context, method, path string, body io.Reader) (*http.Response, error) { url := fmt.Sprintf("%s/%s", client.BaseURL, strings.TrimLeft(path, "/")) req, err := http.NewRequestWithContext(ctx, method, url, body) if err != nil { return nil, err } req.Header.Set("Authorization", fmt.Sprintf("Token %s", client.APIToken)) // Set Content-Type if body is present if body != nil { req.Header.Set("Content-Type", "application/json") } log.WithFields(logrus.Fields{ "method": method, "url": url, }).Debug("Making HTTP request") resp, err := client.HTTPClient.Do(req) if err != nil { log.WithError(err).WithFields(logrus.Fields{ "url": url, "method": method, "error": err, }).Error("HTTP request failed") return nil, fmt.Errorf("HTTP request failed: %w", err) } // Check if response is HTML instead of JSON for API endpoints if strings.HasPrefix(path, "api/") { contentType := resp.Header.Get("Content-Type") if strings.Contains(contentType, "text/html") { bodyBytes, _ := io.ReadAll(resp.Body) resp.Body.Close() // Create a new response with the same body for the caller resp = &http.Response{ Status: resp.Status, StatusCode: resp.StatusCode, Header: resp.Header, Body: io.NopCloser(bytes.NewBuffer(bodyBytes)), } log.WithFields(logrus.Fields{ "url": url, "method": method, "content-type": contentType, "status-code": resp.StatusCode, "response": string(bodyBytes), "base-url": client.BaseURL, "request-path": path, "full-headers": resp.Header, }).Error("Received HTML response for API request") return nil, fmt.Errorf("received HTML response instead of JSON (status: %d). This often indicates an SSL/TLS issue or invalid authentication. Check your PAPERLESS_URL, PAPERLESS_TOKEN and PAPERLESS_INSECURE_SKIP_VERIFY settings. Full response: %s", resp.StatusCode, string(bodyBytes)) } } return resp, nil } // GetAllTags retrieves all tags from the Paperless-NGX API func (client *PaperlessClient) GetAllTags(ctx context.Context) (map[string]int, error) { tagIDMapping := make(map[string]int) path := "api/tags/" for path != "" { resp, err := client.Do(ctx, "GET", path, nil) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { bodyBytes, _ := io.ReadAll(resp.Body) return nil, fmt.Errorf("error fetching tags: %d, %s", resp.StatusCode, string(bodyBytes)) } var tagsResponse struct { Results []struct { ID int `json:"id"` Name string `json:"name"` } `json:"results"` Next string `json:"next"` } err = json.NewDecoder(resp.Body).Decode(&tagsResponse) if err != nil { return nil, err } for _, tag := range tagsResponse.Results { tagIDMapping[tag.Name] = tag.ID } // Extract relative path from the Next URL if tagsResponse.Next != "" { nextURL := tagsResponse.Next if strings.HasPrefix(nextURL, "http") { // Extract just the path portion from the full URL if parsedURL, err := url.Parse(nextURL); err == nil { path = strings.TrimPrefix(parsedURL.Path, "/") if parsedURL.RawQuery != "" { path += "?" + parsedURL.RawQuery } } else { return nil, fmt.Errorf("failed to parse next URL: %v", err) } } else { path = strings.TrimPrefix(nextURL, "/") } } else { path = "" } } return tagIDMapping, nil } // GetDocumentsByTags retrieves documents that match the specified tags func (client *PaperlessClient) GetDocumentsByTags(ctx context.Context, tags []string, pageSize int) ([]Document, error) { tagQueries := make([]string, len(tags)) for i, tag := range tags { tagQueries[i] = fmt.Sprintf("tags__name__iexact=%s", tag) } searchQuery := strings.Join(tagQueries, "&") path := fmt.Sprintf("api/documents/?%s&page_size=%d", urlEncode(searchQuery), pageSize) resp, err := client.Do(ctx, "GET", path, nil) if err != nil { return nil, fmt.Errorf("HTTP request failed in GetDocumentsByTags: %w", err) } defer resp.Body.Close() // Read the response body bodyBytes, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("failed to read response body: %w", err) } if resp.StatusCode != http.StatusOK { log.WithFields(logrus.Fields{ "status_code": resp.StatusCode, "path": path, "response": string(bodyBytes), "headers": resp.Header, }).Error("Error response from server in GetDocumentsByTags") return nil, fmt.Errorf("error searching documents: status=%d, body=%s", resp.StatusCode, string(bodyBytes)) } var documentsResponse GetDocumentsApiResponse err = json.Unmarshal(bodyBytes, &documentsResponse) if err != nil { log.WithFields(logrus.Fields{ "response_body": string(bodyBytes), "error": err, }).Error("Failed to parse JSON response in GetDocumentsByTags") return nil, fmt.Errorf("failed to parse JSON response: %w", err) } allTags, err := client.GetAllTags(ctx) if err != nil { return nil, err } allCorrespondents, err := client.GetAllCorrespondents(ctx) if err != nil { return nil, err } documents := make([]Document, 0, len(documentsResponse.Results)) for _, result := range documentsResponse.Results { tagNames := make([]string, len(result.Tags)) for i, resultTagID := range result.Tags { for tagName, tagID := range allTags { if resultTagID == tagID { tagNames[i] = tagName break } } } correspondentName := "" if result.Correspondent != 0 { for name, id := range allCorrespondents { if result.Correspondent == id { correspondentName = name break } } } documents = append(documents, Document{ ID: result.ID, Title: result.Title, Content: result.Content, Correspondent: correspondentName, Tags: tagNames, }) } return documents, nil } // DownloadPDF downloads the PDF file of the specified document func (client *PaperlessClient) DownloadPDF(ctx context.Context, document Document) ([]byte, error) { path := fmt.Sprintf("api/documents/%d/download/", document.ID) resp, err := client.Do(ctx, "GET", path, nil) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { bodyBytes, _ := io.ReadAll(resp.Body) return nil, fmt.Errorf("error downloading document %d: %d, %s", document.ID, resp.StatusCode, string(bodyBytes)) } return io.ReadAll(resp.Body) } func (client *PaperlessClient) GetDocument(ctx context.Context, documentID int) (Document, error) { path := fmt.Sprintf("api/documents/%d/", documentID) resp, err := client.Do(ctx, "GET", path, nil) if err != nil { return Document{}, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { bodyBytes, _ := io.ReadAll(resp.Body) return Document{}, fmt.Errorf("error fetching document %d: %d, %s", documentID, resp.StatusCode, string(bodyBytes)) } var documentResponse GetDocumentApiResponse err = json.NewDecoder(resp.Body).Decode(&documentResponse) if err != nil { return Document{}, err } allTags, err := client.GetAllTags(ctx) if err != nil { return Document{}, err } allCorrespondents, err := client.GetAllCorrespondents(ctx) if err != nil { return Document{}, err } // Match tag IDs to tag names tagNames := make([]string, len(documentResponse.Tags)) for i, resultTagID := range documentResponse.Tags { for tagName, tagID := range allTags { if resultTagID == tagID { tagNames[i] = tagName break } } } // Match correspondent ID to correspondent name correspondentName := "" for name, id := range allCorrespondents { if documentResponse.Correspondent == id { correspondentName = name break } } return Document{ ID: documentResponse.ID, Title: documentResponse.Title, Content: documentResponse.Content, Correspondent: correspondentName, Tags: tagNames, }, nil } // UpdateDocuments updates the specified documents with suggested changes func (client *PaperlessClient) UpdateDocuments(ctx context.Context, documents []DocumentSuggestion, db *gorm.DB, isUndo bool) error { // Fetch all available tags availableTags, err := client.GetAllTags(ctx) if err != nil { log.Errorf("Error fetching available tags: %v", err) return err } documentsContainSuggestedCorrespondent := false for _, document := range documents { if document.SuggestedCorrespondent != "" { documentsContainSuggestedCorrespondent = true break } } availableCorrespondents := make(map[string]int) if documentsContainSuggestedCorrespondent { availableCorrespondents, err = client.GetAllCorrespondents(ctx) if err != nil { log.Errorf("Error fetching available correspondents: %v", err) return err } } for _, document := range documents { documentID := document.ID // Original fields will store any updated fields to store records for originalFields := make(map[string]interface{}) updatedFields := make(map[string]interface{}) newTags := []int{} tags := document.SuggestedTags originalTags := document.OriginalDocument.Tags originalTagsJSON, err := json.Marshal(originalTags) if err != nil { log.Errorf("Error marshalling JSON for document %d: %v", documentID, err) return err } // remove autoTag to prevent infinite loop (even if it is in the original tags) for _, tag := range document.RemoveTags { originalTags = removeTagFromList(originalTags, tag) } if len(tags) == 0 { tags = originalTags } else { // We have suggested tags to change originalFields["tags"] = originalTags // remove autoTag to prevent infinite loop - this is required in case of undo tags = removeTagFromList(tags, autoTag) // remove duplicates slices.Sort(tags) tags = slices.Compact(tags) } updatedTagsJSON, err := json.Marshal(tags) if err != nil { log.Errorf("Error marshalling JSON for document %d: %v", documentID, err) return err } // Map suggested tag names to IDs for _, tagName := range tags { if tagID, exists := availableTags[tagName]; exists { // Skip the tag that we are filtering if !isUndo && tagName == manualTag { continue } newTags = append(newTags, tagID) } else { log.Errorf("Suggested tag '%s' does not exist in paperless-ngx, skipping.", tagName) } } updatedFields["tags"] = newTags // Map suggested correspondent names to IDs if document.SuggestedCorrespondent != "" { if correspondentID, exists := availableCorrespondents[document.SuggestedCorrespondent]; exists { updatedFields["correspondent"] = correspondentID } else { newCorrespondent := instantiateCorrespondent(document.SuggestedCorrespondent) newCorrespondentID, err := client.CreateOrGetCorrespondent(context.Background(), newCorrespondent) if err != nil { log.Errorf("Error creating/getting correspondent with name %s: %v\n", document.SuggestedCorrespondent, err) return err } log.Infof("Using correspondent with name %s and ID %d\n", document.SuggestedCorrespondent, newCorrespondentID) updatedFields["correspondent"] = newCorrespondentID } } suggestedTitle := document.SuggestedTitle if len(suggestedTitle) > 128 { suggestedTitle = suggestedTitle[:128] } if suggestedTitle != "" { originalFields["title"] = document.OriginalDocument.Title updatedFields["title"] = suggestedTitle } else { log.Warnf("No valid title found for document %d, skipping.", documentID) } // Suggested Content suggestedContent := document.SuggestedContent if suggestedContent != "" { originalFields["content"] = document.OriginalDocument.Content updatedFields["content"] = suggestedContent } log.Debugf("Document %d: Original fields: %v", documentID, originalFields) log.Debugf("Document %d: Updated fields: %v Tags: %v", documentID, updatedFields, tags) // Marshal updated fields to JSON jsonData, err := json.Marshal(updatedFields) if err != nil { log.Errorf("Error marshalling JSON for document %d: %v", documentID, err) return err } // Send the update request using the generic Do method path := fmt.Sprintf("api/documents/%d/", documentID) resp, err := client.Do(ctx, "PATCH", path, bytes.NewBuffer(jsonData)) if err != nil { log.Errorf("Error updating document %d: %v", documentID, err) return err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { bodyBytes, _ := io.ReadAll(resp.Body) log.Errorf("Error updating document %d: %d, %s", documentID, resp.StatusCode, string(bodyBytes)) return fmt.Errorf("error updating document %d: %d, %s", documentID, resp.StatusCode, string(bodyBytes)) } else { for field, value := range originalFields { log.Printf("Document %d: Updated %s from %v to %v", documentID, field, originalFields[field], value) // Insert the modification record into the database var modificationRecord ModificationHistory if field == "tags" { // Make sure we only store changes where tags are changed - not the same before and after // And we have to use tags, not updatedFields as they are IDs not fields if !hasSameTags(document.OriginalDocument.Tags, tags) { modificationRecord = ModificationHistory{ DocumentID: uint(documentID), ModField: field, PreviousValue: string(originalTagsJSON), NewValue: string(updatedTagsJSON), } } } else { // Only store mod if field actually changed if originalFields[field] != updatedFields[field] { modificationRecord = ModificationHistory{ DocumentID: uint(documentID), ModField: field, PreviousValue: fmt.Sprintf("%v", originalFields[field]), NewValue: fmt.Sprintf("%v", updatedFields[field]), } } } // Only store if we have a valid modification record if (modificationRecord != ModificationHistory{}) { err = InsertModification(db, &modificationRecord) } if err != nil { log.Errorf("Error inserting modification record for document %d: %v", documentID, err) return err } } } log.Printf("Document %d updated successfully.", documentID) } return nil } // DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images // If limitPages > 0, only the first N pages will be processed func (client *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, documentId int, limitPages int) ([]string, error) { // Create a directory named after the document ID docDir := filepath.Join(client.GetCacheFolder(), fmt.Sprintf("document-%d", documentId)) if _, err := os.Stat(docDir); os.IsNotExist(err) { err = os.MkdirAll(docDir, 0755) if err != nil { return nil, err } } // Check if images already exist var imagePaths []string for n := 0; ; n++ { if limitPages > 0 && n >= limitPages { break } imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n)) if _, err := os.Stat(imagePath); os.IsNotExist(err) { break } imagePaths = append(imagePaths, imagePath) } // If images exist, return them if len(imagePaths) > 0 { return imagePaths, nil } // Proceed with downloading and converting the document to images path := fmt.Sprintf("api/documents/%d/download/", documentId) resp, err := client.Do(ctx, "GET", path, nil) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { bodyBytes, _ := io.ReadAll(resp.Body) return nil, fmt.Errorf("error downloading document %d: %d, %s", documentId, resp.StatusCode, string(bodyBytes)) } pdfData, err := io.ReadAll(resp.Body) if err != nil { return nil, err } tmpFile, err := os.CreateTemp("", "document-*.pdf") if err != nil { return nil, err } defer os.Remove(tmpFile.Name()) _, err = tmpFile.Write(pdfData) if err != nil { return nil, err } tmpFile.Close() doc, err := fitz.New(tmpFile.Name()) if err != nil { return nil, err } defer doc.Close() totalPages := doc.NumPage() if limitPages > 0 && limitPages < totalPages { totalPages = limitPages } var mu sync.Mutex var g errgroup.Group for n := 0; n < totalPages; n++ { n := n // capture loop variable g.Go(func() error { mu.Lock() // I assume the libmupdf library is not thread-safe img, err := doc.Image(n) mu.Unlock() if err != nil { return err } imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n)) f, err := os.Create(imagePath) if err != nil { return err } err = jpeg.Encode(f, img, &jpeg.Options{Quality: jpeg.DefaultQuality}) if err != nil { f.Close() return err } f.Close() // Verify the JPEG file file, err := os.Open(imagePath) if err != nil { return err } defer file.Close() _, err = jpeg.Decode(file) if err != nil { return fmt.Errorf("invalid JPEG file: %s", imagePath) } mu.Lock() imagePaths = append(imagePaths, imagePath) mu.Unlock() return nil }) } if err := g.Wait(); err != nil { return nil, err } // sort the image paths to ensure they are in order slices.Sort(imagePaths) return imagePaths, nil } // GetCacheFolder returns the cache folder for the PaperlessClient func (client *PaperlessClient) GetCacheFolder() string { if client.CacheFolder == "" { client.CacheFolder = filepath.Join(os.TempDir(), "paperless-gpt") } return client.CacheFolder } // urlEncode encodes a string for safe URL usage func urlEncode(s string) string { return strings.ReplaceAll(s, " ", "+") } // instantiateCorrespondent creates a new Correspondent object with default values func instantiateCorrespondent(name string) Correspondent { return Correspondent{ Name: name, MatchingAlgorithm: 0, Match: "", IsInsensitive: true, Owner: nil, } } // CreateOrGetCorrespondent creates a new correspondent or returns existing one if name already exists func (client *PaperlessClient) CreateOrGetCorrespondent(ctx context.Context, correspondent Correspondent) (int, error) { // First try to find existing correspondent correspondents, err := client.GetAllCorrespondents(ctx) if err != nil { return 0, fmt.Errorf("error fetching correspondents: %w", err) } // Check if correspondent already exists if id, exists := correspondents[correspondent.Name]; exists { log.Infof("Using existing correspondent with name %s and ID %d", correspondent.Name, id) return id, nil } // If not found, create new correspondent url := "api/correspondents/" jsonData, err := json.Marshal(correspondent) if err != nil { return 0, err } resp, err := client.Do(ctx, "POST", url, bytes.NewBuffer(jsonData)) if err != nil { return 0, err } defer resp.Body.Close() if resp.StatusCode != http.StatusCreated { bodyBytes, _ := io.ReadAll(resp.Body) return 0, fmt.Errorf("error creating correspondent: %d, %s", resp.StatusCode, string(bodyBytes)) } var createdCorrespondent struct { ID int `json:"id"` } err = json.NewDecoder(resp.Body).Decode(&createdCorrespondent) if err != nil { return 0, err } return createdCorrespondent.ID, nil } // CorrespondentResponse represents the response structure for correspondents type CorrespondentResponse struct { Results []struct { ID int `json:"id"` Name string `json:"name"` } `json:"results"` } // GetAllCorrespondents retrieves all correspondents from the Paperless-NGX API func (client *PaperlessClient) GetAllCorrespondents(ctx context.Context) (map[string]int, error) { correspondentIDMapping := make(map[string]int) path := "api/correspondents/?page_size=9999" resp, err := client.Do(ctx, "GET", path, nil) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { bodyBytes, _ := io.ReadAll(resp.Body) return nil, fmt.Errorf("error fetching correspondents: %d, %s", resp.StatusCode, string(bodyBytes)) } var correspondentsResponse CorrespondentResponse err = json.NewDecoder(resp.Body).Decode(&correspondentsResponse) if err != nil { return nil, err } for _, correspondent := range correspondentsResponse.Results { correspondentIDMapping[correspondent.Name] = correspondent.ID } return correspondentIDMapping, nil }