2024-10-21 14:52:23 -05:00
package main
import (
"bytes"
"context"
2025-02-12 08:47:29 -06:00
"crypto/tls"
2024-10-21 14:52:23 -05:00
"encoding/json"
"fmt"
2024-10-28 11:34:41 -05:00
"image/jpeg"
2024-10-21 14:52:23 -05:00
"io"
"net/http"
2025-02-12 08:47:29 -06:00
"net/url"
2024-10-28 11:34:41 -05:00
"os"
"path/filepath"
2024-10-28 17:05:25 -05:00
"slices"
2024-12-13 09:48:09 -06:00
"sort"
2024-10-21 14:52:23 -05:00
"strings"
2024-10-28 11:34:41 -05:00
"sync"
"github.com/gen2brain/go-fitz"
2025-02-12 08:47:29 -06:00
"github.com/sirupsen/logrus"
2024-10-28 11:34:41 -05:00
"golang.org/x/sync/errgroup"
2024-12-13 09:48:09 -06:00
"gorm.io/gorm"
2024-10-21 14:52:23 -05:00
)
// PaperlessClient talks to the Paperless-NGX REST API.
type PaperlessClient struct {
	// BaseURL is the root URL of the Paperless-NGX instance; request paths
	// passed to Do are appended to it.
	BaseURL string
	// APIToken is sent on every request as "Authorization: Token <APIToken>".
	APIToken string
	// HTTPClient performs the HTTP round trips.
	HTTPClient *http.Client
	// CacheFolder is the directory used for cached page images. When empty,
	// GetCacheFolder falls back to a directory under os.TempDir().
	CacheFolder string
}
2024-12-13 09:48:09 -06:00
// hasSameTags reports whether original and suggested hold the same tags,
// ignoring order. Duplicates count: each tag must occur the same number of
// times in both slices. Neither input slice is modified.
func hasSameTags(original, suggested []string) bool {
	if len(original) != len(suggested) {
		return false
	}

	// Sort private copies so the callers' slices keep their order.
	left := append(make([]string, 0, len(original)), original...)
	right := append(make([]string, 0, len(suggested)), suggested...)
	sort.Strings(left)
	sort.Strings(right)

	for idx, tag := range left {
		if right[idx] != tag {
			return false
		}
	}
	return true
}
2024-10-21 14:52:23 -05:00
// NewPaperlessClient creates a new instance of PaperlessClient with a default HTTP client
func NewPaperlessClient ( baseURL , apiToken string ) * PaperlessClient {
2024-10-28 11:34:41 -05:00
cacheFolder := os . Getenv ( "PAPERLESS_GPT_CACHE_DIR" )
2025-02-12 08:47:29 -06:00
// Create a custom HTTP transport with TLS configuration
tr := & http . Transport {
TLSClientConfig : & tls . Config {
InsecureSkipVerify : paperlessInsecureSkipVerify ,
} ,
}
httpClient := & http . Client { Transport : tr }
2024-10-21 14:52:23 -05:00
return & PaperlessClient {
2024-10-28 11:34:41 -05:00
BaseURL : strings . TrimRight ( baseURL , "/" ) ,
APIToken : apiToken ,
2025-02-12 08:47:29 -06:00
HTTPClient : httpClient ,
2024-10-28 11:34:41 -05:00
CacheFolder : cacheFolder ,
2024-10-21 14:52:23 -05:00
}
}
// Do method to make requests to the Paperless-NGX API
2025-01-13 08:59:29 -06:00
func ( client * PaperlessClient ) Do ( ctx context . Context , method , path string , body io . Reader ) ( * http . Response , error ) {
url := fmt . Sprintf ( "%s/%s" , client . BaseURL , strings . TrimLeft ( path , "/" ) )
2024-10-21 14:52:23 -05:00
req , err := http . NewRequestWithContext ( ctx , method , url , body )
if err != nil {
return nil , err
}
2025-01-13 08:59:29 -06:00
req . Header . Set ( "Authorization" , fmt . Sprintf ( "Token %s" , client . APIToken ) )
2024-10-21 14:52:23 -05:00
// Set Content-Type if body is present
if body != nil {
req . Header . Set ( "Content-Type" , "application/json" )
}
2025-02-12 08:47:29 -06:00
log . WithFields ( logrus . Fields {
"method" : method ,
"url" : url ,
} ) . Debug ( "Making HTTP request" )
resp , err := client . HTTPClient . Do ( req )
if err != nil {
log . WithError ( err ) . WithFields ( logrus . Fields {
"url" : url ,
"method" : method ,
"error" : err ,
} ) . Error ( "HTTP request failed" )
return nil , fmt . Errorf ( "HTTP request failed: %w" , err )
}
// Check if response is HTML instead of JSON for API endpoints
if strings . HasPrefix ( path , "api/" ) {
contentType := resp . Header . Get ( "Content-Type" )
if strings . Contains ( contentType , "text/html" ) {
bodyBytes , _ := io . ReadAll ( resp . Body )
resp . Body . Close ( )
// Create a new response with the same body for the caller
resp = & http . Response {
Status : resp . Status ,
StatusCode : resp . StatusCode ,
Header : resp . Header ,
Body : io . NopCloser ( bytes . NewBuffer ( bodyBytes ) ) ,
}
log . WithFields ( logrus . Fields {
"url" : url ,
"method" : method ,
"content-type" : contentType ,
"status-code" : resp . StatusCode ,
"response" : string ( bodyBytes ) ,
"base-url" : client . BaseURL ,
"request-path" : path ,
"full-headers" : resp . Header ,
} ) . Error ( "Received HTML response for API request" )
return nil , fmt . Errorf ( "received HTML response instead of JSON (status: %d). This often indicates an SSL/TLS issue or invalid authentication. Check your PAPERLESS_URL, PAPERLESS_TOKEN and PAPERLESS_INSECURE_SKIP_VERIFY settings. Full response: %s" , resp . StatusCode , string ( bodyBytes ) )
}
}
return resp , nil
2024-10-21 14:52:23 -05:00
}
// GetAllTags retrieves all tags from the Paperless-NGX API
2025-01-13 08:59:29 -06:00
func ( client * PaperlessClient ) GetAllTags ( ctx context . Context ) ( map [ string ] int , error ) {
2024-10-21 14:52:23 -05:00
tagIDMapping := make ( map [ string ] int )
path := "api/tags/"
for path != "" {
2025-01-13 08:59:29 -06:00
resp , err := client . Do ( ctx , "GET" , path , nil )
2024-10-21 14:52:23 -05:00
if err != nil {
return nil , err
}
defer resp . Body . Close ( )
if resp . StatusCode != http . StatusOK {
bodyBytes , _ := io . ReadAll ( resp . Body )
return nil , fmt . Errorf ( "error fetching tags: %d, %s" , resp . StatusCode , string ( bodyBytes ) )
}
var tagsResponse struct {
Results [ ] struct {
ID int ` json:"id" `
Name string ` json:"name" `
} ` json:"results" `
Next string ` json:"next" `
}
err = json . NewDecoder ( resp . Body ) . Decode ( & tagsResponse )
if err != nil {
return nil , err
}
for _ , tag := range tagsResponse . Results {
tagIDMapping [ tag . Name ] = tag . ID
}
// Extract relative path from the Next URL
if tagsResponse . Next != "" {
nextURL := tagsResponse . Next
2025-02-12 08:47:29 -06:00
if strings . HasPrefix ( nextURL , "http" ) {
// Extract just the path portion from the full URL
if parsedURL , err := url . Parse ( nextURL ) ; err == nil {
path = strings . TrimPrefix ( parsedURL . Path , "/" )
if parsedURL . RawQuery != "" {
path += "?" + parsedURL . RawQuery
}
} else {
return nil , fmt . Errorf ( "failed to parse next URL: %v" , err )
}
} else {
path = strings . TrimPrefix ( nextURL , "/" )
2024-10-21 14:52:23 -05:00
}
} else {
path = ""
}
}
return tagIDMapping , nil
}
// GetDocumentsByTags retrieves documents that match the specified tags
2025-01-13 08:59:29 -06:00
func ( client * PaperlessClient ) GetDocumentsByTags ( ctx context . Context , tags [ ] string , pageSize int ) ( [ ] Document , error ) {
2024-10-21 14:52:23 -05:00
tagQueries := make ( [ ] string , len ( tags ) )
for i , tag := range tags {
2024-11-11 03:40:39 -06:00
tagQueries [ i ] = fmt . Sprintf ( "tags__name__iexact=%s" , tag )
2024-10-21 14:52:23 -05:00
}
2024-11-11 03:40:39 -06:00
searchQuery := strings . Join ( tagQueries , "&" )
2025-01-13 08:59:29 -06:00
path := fmt . Sprintf ( "api/documents/?%s&page_size=%d" , urlEncode ( searchQuery ) , pageSize )
2024-10-21 14:52:23 -05:00
2025-01-13 08:59:29 -06:00
resp , err := client . Do ( ctx , "GET" , path , nil )
2024-10-21 14:52:23 -05:00
if err != nil {
2025-02-12 08:47:29 -06:00
return nil , fmt . Errorf ( "HTTP request failed in GetDocumentsByTags: %w" , err )
2024-10-21 14:52:23 -05:00
}
defer resp . Body . Close ( )
2025-02-12 08:47:29 -06:00
// Read the response body
bodyBytes , err := io . ReadAll ( resp . Body )
if err != nil {
return nil , fmt . Errorf ( "failed to read response body: %w" , err )
}
2024-10-21 14:52:23 -05:00
if resp . StatusCode != http . StatusOK {
2025-02-12 08:47:29 -06:00
log . WithFields ( logrus . Fields {
"status_code" : resp . StatusCode ,
"path" : path ,
"response" : string ( bodyBytes ) ,
"headers" : resp . Header ,
} ) . Error ( "Error response from server in GetDocumentsByTags" )
return nil , fmt . Errorf ( "error searching documents: status=%d, body=%s" , resp . StatusCode , string ( bodyBytes ) )
2024-10-21 14:52:23 -05:00
}
var documentsResponse GetDocumentsApiResponse
2025-02-12 08:47:29 -06:00
err = json . Unmarshal ( bodyBytes , & documentsResponse )
2024-10-21 14:52:23 -05:00
if err != nil {
2025-02-12 08:47:29 -06:00
log . WithFields ( logrus . Fields {
"response_body" : string ( bodyBytes ) ,
"error" : err ,
} ) . Error ( "Failed to parse JSON response in GetDocumentsByTags" )
return nil , fmt . Errorf ( "failed to parse JSON response: %w" , err )
2024-10-21 14:52:23 -05:00
}
2025-01-13 08:59:29 -06:00
allTags , err := client . GetAllTags ( ctx )
if err != nil {
return nil , err
}
allCorrespondents , err := client . GetAllCorrespondents ( ctx )
2024-10-21 14:52:23 -05:00
if err != nil {
return nil , err
}
documents := make ( [ ] Document , 0 , len ( documentsResponse . Results ) )
for _ , result := range documentsResponse . Results {
tagNames := make ( [ ] string , len ( result . Tags ) )
for i , resultTagID := range result . Tags {
for tagName , tagID := range allTags {
if resultTagID == tagID {
tagNames [ i ] = tagName
break
}
}
}
2025-01-13 08:59:29 -06:00
correspondentName := ""
if result . Correspondent != 0 {
for name , id := range allCorrespondents {
if result . Correspondent == id {
correspondentName = name
break
}
}
}
2024-10-21 14:52:23 -05:00
documents = append ( documents , Document {
2025-01-13 08:59:29 -06:00
ID : result . ID ,
Title : result . Title ,
Content : result . Content ,
Correspondent : correspondentName ,
Tags : tagNames ,
2024-10-21 14:52:23 -05:00
} )
}
return documents , nil
}
// DownloadPDF downloads the PDF file of the specified document
2025-01-13 08:59:29 -06:00
func ( client * PaperlessClient ) DownloadPDF ( ctx context . Context , document Document ) ( [ ] byte , error ) {
2024-10-21 14:52:23 -05:00
path := fmt . Sprintf ( "api/documents/%d/download/" , document . ID )
2025-01-13 08:59:29 -06:00
resp , err := client . Do ( ctx , "GET" , path , nil )
2024-10-21 14:52:23 -05:00
if err != nil {
return nil , err
}
defer resp . Body . Close ( )
if resp . StatusCode != http . StatusOK {
bodyBytes , _ := io . ReadAll ( resp . Body )
return nil , fmt . Errorf ( "error downloading document %d: %d, %s" , document . ID , resp . StatusCode , string ( bodyBytes ) )
}
return io . ReadAll ( resp . Body )
}
2025-01-13 08:59:29 -06:00
func ( client * PaperlessClient ) GetDocument ( ctx context . Context , documentID int ) ( Document , error ) {
2024-10-28 11:34:41 -05:00
path := fmt . Sprintf ( "api/documents/%d/" , documentID )
2025-01-13 08:59:29 -06:00
resp , err := client . Do ( ctx , "GET" , path , nil )
2024-10-28 11:34:41 -05:00
if err != nil {
return Document { } , err
}
defer resp . Body . Close ( )
if resp . StatusCode != http . StatusOK {
bodyBytes , _ := io . ReadAll ( resp . Body )
return Document { } , fmt . Errorf ( "error fetching document %d: %d, %s" , documentID , resp . StatusCode , string ( bodyBytes ) )
}
var documentResponse GetDocumentApiResponse
err = json . NewDecoder ( resp . Body ) . Decode ( & documentResponse )
if err != nil {
return Document { } , err
}
2025-01-13 08:59:29 -06:00
allTags , err := client . GetAllTags ( ctx )
2024-10-28 11:34:41 -05:00
if err != nil {
return Document { } , err
}
2025-01-13 08:59:29 -06:00
allCorrespondents , err := client . GetAllCorrespondents ( ctx )
if err != nil {
return Document { } , err
}
// Match tag IDs to tag names
2024-10-28 11:34:41 -05:00
tagNames := make ( [ ] string , len ( documentResponse . Tags ) )
for i , resultTagID := range documentResponse . Tags {
for tagName , tagID := range allTags {
if resultTagID == tagID {
tagNames [ i ] = tagName
break
}
}
}
2025-01-13 08:59:29 -06:00
// Match correspondent ID to correspondent name
correspondentName := ""
for name , id := range allCorrespondents {
if documentResponse . Correspondent == id {
correspondentName = name
break
}
}
2024-10-28 11:34:41 -05:00
return Document {
2025-01-13 08:59:29 -06:00
ID : documentResponse . ID ,
Title : documentResponse . Title ,
Content : documentResponse . Content ,
Correspondent : correspondentName ,
Tags : tagNames ,
2024-10-28 11:34:41 -05:00
} , nil
}
2024-10-21 14:52:23 -05:00
// UpdateDocuments updates the specified documents with suggested changes
2025-01-13 08:59:29 -06:00
func ( client * PaperlessClient ) UpdateDocuments ( ctx context . Context , documents [ ] DocumentSuggestion , db * gorm . DB , isUndo bool ) error {
2024-10-21 14:52:23 -05:00
// Fetch all available tags
2025-01-13 08:59:29 -06:00
availableTags , err := client . GetAllTags ( ctx )
2024-10-21 14:52:23 -05:00
if err != nil {
2024-10-31 14:00:43 -05:00
log . Errorf ( "Error fetching available tags: %v" , err )
2024-10-21 14:52:23 -05:00
return err
}
2025-01-13 08:59:29 -06:00
documentsContainSuggestedCorrespondent := false
for _ , document := range documents {
if document . SuggestedCorrespondent != "" {
documentsContainSuggestedCorrespondent = true
break
}
}
availableCorrespondents := make ( map [ string ] int )
if documentsContainSuggestedCorrespondent {
availableCorrespondents , err = client . GetAllCorrespondents ( ctx )
if err != nil {
log . Errorf ( "Error fetching available correspondents: %v" ,
err )
return err
}
}
2024-10-21 14:52:23 -05:00
for _ , document := range documents {
documentID := document . ID
2024-12-13 09:48:09 -06:00
// Original fields will store any updated fields to store records for
originalFields := make ( map [ string ] interface { } )
2024-10-21 14:52:23 -05:00
updatedFields := make ( map [ string ] interface { } )
newTags := [ ] int { }
tags := document . SuggestedTags
2024-12-13 09:48:09 -06:00
originalTags := document . OriginalDocument . Tags
originalTagsJSON , err := json . Marshal ( originalTags )
if err != nil {
log . Errorf ( "Error marshalling JSON for document %d: %v" , documentID , err )
return err
2024-10-21 14:52:23 -05:00
}
2024-12-13 09:48:09 -06:00
2024-10-21 16:46:22 -05:00
// remove autoTag to prevent infinite loop (even if it is in the original tags)
2025-01-13 03:52:56 -06:00
for _ , tag := range document . RemoveTags {
originalTags = removeTagFromList ( originalTags , tag )
}
2024-12-13 09:48:09 -06:00
if len ( tags ) == 0 {
tags = originalTags
} else {
// We have suggested tags to change
originalFields [ "tags" ] = originalTags
// remove autoTag to prevent infinite loop - this is required in case of undo
tags = removeTagFromList ( tags , autoTag )
2025-01-13 07:09:03 -06:00
// remove duplicates
slices . Sort ( tags )
tags = slices . Compact ( tags )
2024-12-13 09:48:09 -06:00
}
updatedTagsJSON , err := json . Marshal ( tags )
if err != nil {
log . Errorf ( "Error marshalling JSON for document %d: %v" , documentID , err )
return err
}
2024-10-21 14:52:23 -05:00
// Map suggested tag names to IDs
for _ , tagName := range tags {
if tagID , exists := availableTags [ tagName ] ; exists {
// Skip the tag that we are filtering
2024-12-13 09:48:09 -06:00
if ! isUndo && tagName == manualTag {
2024-10-21 14:52:23 -05:00
continue
}
newTags = append ( newTags , tagID )
} else {
2025-01-13 08:59:29 -06:00
log . Errorf ( "Suggested tag '%s' does not exist in paperless-ngx, skipping." , tagName )
2024-10-21 14:52:23 -05:00
}
}
updatedFields [ "tags" ] = newTags
2025-01-13 08:59:29 -06:00
// Map suggested correspondent names to IDs
if document . SuggestedCorrespondent != "" {
if correspondentID , exists := availableCorrespondents [ document . SuggestedCorrespondent ] ; exists {
updatedFields [ "correspondent" ] = correspondentID
} else {
newCorrespondent := instantiateCorrespondent ( document . SuggestedCorrespondent )
2025-02-08 12:30:28 -06:00
newCorrespondentID , err := client . CreateOrGetCorrespondent ( context . Background ( ) , newCorrespondent )
2025-01-13 08:59:29 -06:00
if err != nil {
2025-02-08 12:30:28 -06:00
log . Errorf ( "Error creating/getting correspondent with name %s: %v\n" , document . SuggestedCorrespondent , err )
2025-01-13 08:59:29 -06:00
return err
}
2025-02-08 12:30:28 -06:00
log . Infof ( "Using correspondent with name %s and ID %d\n" , document . SuggestedCorrespondent , newCorrespondentID )
2025-01-13 08:59:29 -06:00
updatedFields [ "correspondent" ] = newCorrespondentID
}
}
2024-10-21 14:52:23 -05:00
suggestedTitle := document . SuggestedTitle
if len ( suggestedTitle ) > 128 {
suggestedTitle = suggestedTitle [ : 128 ]
}
if suggestedTitle != "" {
2024-12-13 09:48:09 -06:00
originalFields [ "title" ] = document . OriginalDocument . Title
2024-10-21 14:52:23 -05:00
updatedFields [ "title" ] = suggestedTitle
} else {
2024-10-31 14:00:43 -05:00
log . Warnf ( "No valid title found for document %d, skipping." , documentID )
2024-10-21 14:52:23 -05:00
}
2024-10-28 11:34:41 -05:00
// Suggested Content
suggestedContent := document . SuggestedContent
if suggestedContent != "" {
2024-12-13 09:48:09 -06:00
originalFields [ "content" ] = document . OriginalDocument . Content
2024-10-28 11:34:41 -05:00
updatedFields [ "content" ] = suggestedContent
}
2024-12-13 09:48:09 -06:00
log . Debugf ( "Document %d: Original fields: %v" , documentID , originalFields )
log . Debugf ( "Document %d: Updated fields: %v Tags: %v" , documentID , updatedFields , tags )
2024-10-28 11:34:41 -05:00
2024-10-21 14:52:23 -05:00
// Marshal updated fields to JSON
jsonData , err := json . Marshal ( updatedFields )
if err != nil {
2024-10-31 14:00:43 -05:00
log . Errorf ( "Error marshalling JSON for document %d: %v" , documentID , err )
2024-10-21 14:52:23 -05:00
return err
}
// Send the update request using the generic Do method
path := fmt . Sprintf ( "api/documents/%d/" , documentID )
2025-01-13 08:59:29 -06:00
resp , err := client . Do ( ctx , "PATCH" , path , bytes . NewBuffer ( jsonData ) )
2024-10-21 14:52:23 -05:00
if err != nil {
2024-10-31 14:00:43 -05:00
log . Errorf ( "Error updating document %d: %v" , documentID , err )
2024-10-21 14:52:23 -05:00
return err
}
defer resp . Body . Close ( )
if resp . StatusCode != http . StatusOK {
bodyBytes , _ := io . ReadAll ( resp . Body )
2024-10-31 14:00:43 -05:00
log . Errorf ( "Error updating document %d: %d, %s" , documentID , resp . StatusCode , string ( bodyBytes ) )
2024-10-21 14:52:23 -05:00
return fmt . Errorf ( "error updating document %d: %d, %s" , documentID , resp . StatusCode , string ( bodyBytes ) )
2024-12-13 09:48:09 -06:00
} else {
for field , value := range originalFields {
log . Printf ( "Document %d: Updated %s from %v to %v" , documentID , field , originalFields [ field ] , value )
// Insert the modification record into the database
var modificationRecord ModificationHistory
if field == "tags" {
// Make sure we only store changes where tags are changed - not the same before and after
// And we have to use tags, not updatedFields as they are IDs not fields
if ! hasSameTags ( document . OriginalDocument . Tags , tags ) {
modificationRecord = ModificationHistory {
DocumentID : uint ( documentID ) ,
ModField : field ,
PreviousValue : string ( originalTagsJSON ) ,
NewValue : string ( updatedTagsJSON ) ,
}
}
} else {
// Only store mod if field actually changed
if originalFields [ field ] != updatedFields [ field ] {
modificationRecord = ModificationHistory {
DocumentID : uint ( documentID ) ,
ModField : field ,
PreviousValue : fmt . Sprintf ( "%v" , originalFields [ field ] ) ,
NewValue : fmt . Sprintf ( "%v" , updatedFields [ field ] ) ,
}
}
}
// Only store if we have a valid modification record
if ( modificationRecord != ModificationHistory { } ) {
err = InsertModification ( db , & modificationRecord )
}
if err != nil {
log . Errorf ( "Error inserting modification record for document %d: %v" , documentID , err )
return err
}
}
2024-10-21 14:52:23 -05:00
}
log . Printf ( "Document %d updated successfully." , documentID )
}
return nil
}
2024-10-28 11:34:41 -05:00
// DownloadDocumentAsImages downloads the PDF file of the specified document and converts it to images
2025-01-10 10:03:53 -06:00
// If limitPages > 0, only the first N pages will be processed
2025-01-13 08:59:29 -06:00
func ( client * PaperlessClient ) DownloadDocumentAsImages ( ctx context . Context , documentId int , limitPages int ) ( [ ] string , error ) {
2024-10-28 11:34:41 -05:00
// Create a directory named after the document ID
2025-01-13 08:59:29 -06:00
docDir := filepath . Join ( client . GetCacheFolder ( ) , fmt . Sprintf ( "document-%d" , documentId ) )
2024-10-28 11:34:41 -05:00
if _ , err := os . Stat ( docDir ) ; os . IsNotExist ( err ) {
err = os . MkdirAll ( docDir , 0755 )
if err != nil {
return nil , err
}
}
// Check if images already exist
var imagePaths [ ] string
for n := 0 ; ; n ++ {
2025-01-10 10:03:53 -06:00
if limitPages > 0 && n >= limitPages {
break
}
2024-10-28 11:34:41 -05:00
imagePath := filepath . Join ( docDir , fmt . Sprintf ( "page%03d.jpg" , n ) )
if _ , err := os . Stat ( imagePath ) ; os . IsNotExist ( err ) {
break
}
imagePaths = append ( imagePaths , imagePath )
}
// If images exist, return them
if len ( imagePaths ) > 0 {
return imagePaths , nil
}
// Proceed with downloading and converting the document to images
path := fmt . Sprintf ( "api/documents/%d/download/" , documentId )
2025-01-13 08:59:29 -06:00
resp , err := client . Do ( ctx , "GET" , path , nil )
2024-10-28 11:34:41 -05:00
if err != nil {
return nil , err
}
defer resp . Body . Close ( )
if resp . StatusCode != http . StatusOK {
bodyBytes , _ := io . ReadAll ( resp . Body )
return nil , fmt . Errorf ( "error downloading document %d: %d, %s" , documentId , resp . StatusCode , string ( bodyBytes ) )
}
pdfData , err := io . ReadAll ( resp . Body )
if err != nil {
return nil , err
}
tmpFile , err := os . CreateTemp ( "" , "document-*.pdf" )
if err != nil {
return nil , err
}
defer os . Remove ( tmpFile . Name ( ) )
_ , err = tmpFile . Write ( pdfData )
if err != nil {
return nil , err
}
tmpFile . Close ( )
doc , err := fitz . New ( tmpFile . Name ( ) )
if err != nil {
return nil , err
}
defer doc . Close ( )
2025-01-10 10:03:53 -06:00
totalPages := doc . NumPage ( )
if limitPages > 0 && limitPages < totalPages {
totalPages = limitPages
}
2024-10-28 11:34:41 -05:00
var mu sync . Mutex
var g errgroup . Group
2025-01-10 10:03:53 -06:00
for n := 0 ; n < totalPages ; n ++ {
2024-10-28 11:34:41 -05:00
n := n // capture loop variable
g . Go ( func ( ) error {
mu . Lock ( )
// I assume the libmupdf library is not thread-safe
img , err := doc . Image ( n )
mu . Unlock ( )
if err != nil {
return err
}
imagePath := filepath . Join ( docDir , fmt . Sprintf ( "page%03d.jpg" , n ) )
f , err := os . Create ( imagePath )
if err != nil {
return err
}
err = jpeg . Encode ( f , img , & jpeg . Options { Quality : jpeg . DefaultQuality } )
if err != nil {
f . Close ( )
return err
}
f . Close ( )
// Verify the JPEG file
file , err := os . Open ( imagePath )
if err != nil {
return err
}
defer file . Close ( )
_ , err = jpeg . Decode ( file )
if err != nil {
return fmt . Errorf ( "invalid JPEG file: %s" , imagePath )
}
mu . Lock ( )
imagePaths = append ( imagePaths , imagePath )
mu . Unlock ( )
return nil
} )
}
if err := g . Wait ( ) ; err != nil {
return nil , err
}
2024-10-28 17:05:25 -05:00
// sort the image paths to ensure they are in order
slices . Sort ( imagePaths )
2024-10-28 11:34:41 -05:00
return imagePaths , nil
}
// GetCacheFolder returns the cache folder for the PaperlessClient
2025-01-13 08:59:29 -06:00
func ( client * PaperlessClient ) GetCacheFolder ( ) string {
if client . CacheFolder == "" {
client . CacheFolder = filepath . Join ( os . TempDir ( ) , "paperless-gpt" )
2024-10-28 11:34:41 -05:00
}
2025-01-13 08:59:29 -06:00
return client . CacheFolder
2024-10-28 11:34:41 -05:00
}
2024-10-21 14:52:23 -05:00
// urlEncode replaces every space in s with "+". No other character is
// escaped, so s may be a whole query string whose "&" and "=" separators
// must survive.
func urlEncode(s string) string {
	pieces := strings.Split(s, " ")
	return strings.Join(pieces, "+")
}
2025-01-13 08:59:29 -06:00
// instantiateCorrespondent builds a Correspondent with the given name and the
// defaults used for newly created correspondents: matching algorithm 0, an
// empty match pattern, case-insensitive matching enabled and no owner.
func instantiateCorrespondent(name string) Correspondent {
	var correspondent Correspondent
	correspondent.Name = name
	correspondent.MatchingAlgorithm = 0
	correspondent.Match = ""
	correspondent.IsInsensitive = true
	correspondent.Owner = nil
	return correspondent
}
2025-02-08 12:30:28 -06:00
// CreateOrGetCorrespondent creates a new correspondent or returns existing one if name already exists
func ( client * PaperlessClient ) CreateOrGetCorrespondent ( ctx context . Context , correspondent Correspondent ) ( int , error ) {
// First try to find existing correspondent
correspondents , err := client . GetAllCorrespondents ( ctx )
if err != nil {
return 0 , fmt . Errorf ( "error fetching correspondents: %w" , err )
}
2025-01-13 08:59:29 -06:00
2025-02-08 12:30:28 -06:00
// Check if correspondent already exists
if id , exists := correspondents [ correspondent . Name ] ; exists {
log . Infof ( "Using existing correspondent with name %s and ID %d" , correspondent . Name , id )
return id , nil
}
// If not found, create new correspondent
url := "api/correspondents/"
2025-01-13 08:59:29 -06:00
jsonData , err := json . Marshal ( correspondent )
if err != nil {
return 0 , err
}
resp , err := client . Do ( ctx , "POST" , url , bytes . NewBuffer ( jsonData ) )
if err != nil {
return 0 , err
}
defer resp . Body . Close ( )
if resp . StatusCode != http . StatusCreated {
bodyBytes , _ := io . ReadAll ( resp . Body )
return 0 , fmt . Errorf ( "error creating correspondent: %d, %s" , resp . StatusCode , string ( bodyBytes ) )
}
var createdCorrespondent struct {
ID int ` json:"id" `
}
err = json . NewDecoder ( resp . Body ) . Decode ( & createdCorrespondent )
if err != nil {
return 0 , err
}
return createdCorrespondent . ID , nil
}
// CorrespondentResponse represents the response structure for correspondents.
// It is the type GetAllCorrespondents decodes the correspondents list
// response into. Only the "results" array is captured; no pagination
// fields are declared here.
type CorrespondentResponse struct {
// Results holds one entry per correspondent returned by the API.
Results [ ] struct {
// ID is the correspondent's numeric identifier ("id" in the JSON).
ID int ` json:"id" `
// Name is the correspondent's name ("name" in the JSON).
Name string ` json:"name" `
} ` json:"results" `
}
// GetAllCorrespondents retrieves correspondents from the Paperless-NGX API and
// returns a map from correspondent name to ID.
//
// A single request with page_size=9999 is made; further pages are not
// followed. A non-200 status is returned as an error that includes the status
// code and response body.
func (client *PaperlessClient) GetAllCorrespondents(ctx context.Context) (map[string]int, error) {
	resp, err := client.Do(ctx, "GET", "api/correspondents/?page_size=9999", nil)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if status := resp.StatusCode; status != http.StatusOK {
		raw, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("error fetching correspondents: %d, %s", status, string(raw))
	}

	var payload CorrespondentResponse
	if decodeErr := json.NewDecoder(resp.Body).Decode(&payload); decodeErr != nil {
		return nil, decodeErr
	}

	idsByName := make(map[string]int)
	for i := range payload.Results {
		entry := payload.Results[i]
		idsByName[entry.Name] = entry.ID
	}
	return idsByName, nil
}