mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-13 13:18:02 -05:00
Make PDF processing concurrent
This commit is contained in:
parent
8c7bd7273f
commit
223648ee55
3 changed files with 45 additions and 25 deletions
1
go.mod
1
go.mod
|
@@ -10,6 +10,7 @@ require (
|
|||
github.com/gin-gonic/gin v1.10.0
|
||||
github.com/stretchr/testify v1.9.0
|
||||
github.com/tmc/langchaingo v0.1.12
|
||||
golang.org/x/sync v0.7.0
|
||||
)
|
||||
|
||||
require (
|
||||
|
|
2
go.sum
2
go.sum
|
@@ -117,6 +117,8 @@ golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
|
|||
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
|
||||
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
|
|
27
paperless.go
27
paperless.go
|
@@ -12,8 +12,10 @@ import (
|
|||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/gen2brain/go-fitz"
|
||||
"golang.org/x/sync/errgroup"
|
||||
)
|
||||
|
||||
// PaperlessClient struct to interact with the Paperless-NGX API
|
||||
|
@@ -307,37 +309,52 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document
|
|||
}
|
||||
defer doc.Close()
|
||||
|
||||
var mu sync.Mutex
|
||||
var g errgroup.Group
|
||||
|
||||
for n := 0; n < doc.NumPage(); n++ {
|
||||
n := n // capture loop variable
|
||||
g.Go(func() error {
|
||||
img, err := doc.Image(n)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
|
||||
f, err := os.Create(imagePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
err = jpeg.Encode(f, img, &jpeg.Options{Quality: jpeg.DefaultQuality})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
f.Close()
|
||||
return err
|
||||
}
|
||||
f.Close()
|
||||
|
||||
// Verify the JPEG file
|
||||
file, err := os.Open(imagePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
_, err = jpeg.Decode(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid JPEG file: %s", imagePath)
|
||||
return fmt.Errorf("invalid JPEG file: %s", imagePath)
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
imagePaths = append(imagePaths, imagePath)
|
||||
mu.Unlock()
|
||||
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
if err := g.Wait(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return imagePaths, nil
|
||||
|
|
Loading…
Reference in a new issue