Make PDF processing concurrent

This commit is contained in:
Dominik Schröter 2024-10-26 09:16:14 +02:00
parent 8c7bd7273f
commit 223648ee55
3 changed files with 45 additions and 25 deletions

1
go.mod
View file

@ -10,6 +10,7 @@ require (
github.com/gin-gonic/gin v1.10.0
github.com/stretchr/testify v1.9.0
github.com/tmc/langchaingo v0.1.12
golang.org/x/sync v0.7.0
)
require (

2
go.sum
View file

@ -117,6 +117,8 @@ golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=

View file

@ -12,8 +12,10 @@ import (
"os"
"path/filepath"
"strings"
"sync"
"github.com/gen2brain/go-fitz"
"golang.org/x/sync/errgroup"
)
// PaperlessClient struct to interact with the Paperless-NGX API
@ -307,37 +309,52 @@ func (c *PaperlessClient) DownloadDocumentAsImages(ctx context.Context, document
}
defer doc.Close()
var mu sync.Mutex
var g errgroup.Group
for n := 0; n < doc.NumPage(); n++ {
img, err := doc.Image(n)
if err != nil {
return nil, err
}
n := n // capture loop variable
g.Go(func() error {
img, err := doc.Image(n)
if err != nil {
return err
}
imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
f, err := os.Create(imagePath)
if err != nil {
return nil, err
}
imagePath := filepath.Join(docDir, fmt.Sprintf("page%03d.jpg", n))
f, err := os.Create(imagePath)
if err != nil {
return err
}
err = jpeg.Encode(f, img, &jpeg.Options{Quality: jpeg.DefaultQuality})
if err != nil {
return nil, err
}
f.Close()
err = jpeg.Encode(f, img, &jpeg.Options{Quality: jpeg.DefaultQuality})
if err != nil {
f.Close()
return err
}
f.Close()
// Verify the JPEG file
file, err := os.Open(imagePath)
if err != nil {
return nil, err
}
defer file.Close()
// Verify the JPEG file
file, err := os.Open(imagePath)
if err != nil {
return err
}
defer file.Close()
_, err = jpeg.Decode(file)
if err != nil {
return nil, fmt.Errorf("invalid JPEG file: %s", imagePath)
}
_, err = jpeg.Decode(file)
if err != nil {
return fmt.Errorf("invalid JPEG file: %s", imagePath)
}
imagePaths = append(imagePaths, imagePath)
mu.Lock()
imagePaths = append(imagePaths, imagePath)
mu.Unlock()
return nil
})
}
if err := g.Wait(); err != nil {
return nil, err
}
return imagePaths, nil