From 2b436a2ab2501508f9164b5c6a70ab9aeb026a74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Schr=C3=B6ter?= Date: Mon, 28 Oct 2024 15:02:18 +0100 Subject: [PATCH] pretty ui --- app_http_handlers.go | 2 + app_llm.go | 17 ++++++- jobs.go | 15 +++++- main.go | 31 ++++++++++- paperless.go | 6 +++ types.go | 1 + web-app/src/DocumentProcessor.tsx | 26 ++++++---- web-app/src/ExperimentalOCR.tsx | 85 +++++++++++++++++++++++++++---- 8 files changed, 161 insertions(+), 22 deletions(-) diff --git a/app_http_handlers.go b/app_http_handlers.go index 6b5c75c..53db4d7 100644 --- a/app_http_handlers.go +++ b/app_http_handlers.go @@ -189,6 +189,7 @@ func (app *App) getJobStatusHandler(c *gin.Context) { "status": job.Status, "created_at": job.CreatedAt, "updated_at": job.UpdatedAt, + "pages_done": job.PagesDone, } if job.Status == "completed" { @@ -210,6 +211,7 @@ func (app *App) getAllJobsHandler(c *gin.Context) { "status": job.Status, "created_at": job.CreatedAt, "updated_at": job.UpdatedAt, + "pages_done": job.PagesDone, } if job.Status == "completed" { diff --git a/app_llm.go b/app_llm.go index 38afd29..15fb79d 100644 --- a/app_llm.go +++ b/app_llm.go @@ -67,12 +67,27 @@ func (app *App) getSuggestedTags(ctx context.Context, content string, suggestedT } func (app *App) doOCRViaLLM(ctx context.Context, jpegBytes []byte) (string, error) { + + templateMutex.RLock() + defer templateMutex.RUnlock() + likelyLanguage := getLikelyLanguage() + + var promptBuffer bytes.Buffer + err := ocrTemplate.Execute(&promptBuffer, map[string]interface{}{ + "Language": likelyLanguage, + }) + if err != nil { + return "", fmt.Errorf("error executing tag template: %v", err) + } + + prompt := promptBuffer.String() + // Convert the image to text completion, err := app.VisionLLM.GenerateContent(ctx, []llms.MessageContent{ { Parts: []llms.ContentPart{ llms.BinaryPart("image/jpeg", jpegBytes), - llms.TextPart("Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format."), + llms.TextPart(prompt), }, Role: llms.ChatMessageTypeHuman, }, diff --git a/jobs.go b/jobs.go index 4ef9dc4..a2635a4 100644 --- a/jobs.go +++ b/jobs.go @@ -21,6 +21,7 @@ type Job struct { Result string // OCR result or error message CreatedAt time.Time UpdatedAt time.Time + PagesDone int // Number of pages processed } // JobStore manages jobs and their statuses @@ -44,6 +45,7 @@ func generateJobID() string { func (store *JobStore) addJob(job *Job) { store.Lock() defer store.Unlock() + job.PagesDone = 0 // Initialize PagesDone to 0 store.jobs[job.ID] = job logger.Printf("Job added: %v", job) } @@ -84,6 +86,16 @@ func (store *JobStore) updateJobStatus(jobID, status, result string) { } } +func (store *JobStore) updatePagesDone(jobID string, pagesDone int) { + store.Lock() + defer store.Unlock() + if job, exists := store.jobs[jobID]; exists { + job.PagesDone = pagesDone + job.UpdatedAt = time.Now() + logger.Printf("Job pages done updated: %v", job) + } +} + func startWorkerPool(app *App, numWorkers int) { for i := 0; i < numWorkers; i++ { go func(workerID int) { @@ -110,7 +122,7 @@ func processJob(app *App, job *Job) { } var ocrTexts []string - for _, imagePath := range imagePaths { + for i, imagePath := range imagePaths { imageContent, err := os.ReadFile(imagePath) if err != nil { logger.Printf("Error reading image file for job %s: %v", job.ID, err) @@ -126,6 +138,7 @@ func processJob(app *App, job *Job) { } ocrTexts = append(ocrTexts, ocrText) + jobStore.updatePagesDone(job.ID, i+1) // Update PagesDone after each page is processed } // Combine the OCR texts diff --git a/main.go b/main.go index df06bbd..bc1c419 100644 --- a/main.go +++ b/main.go @@ -34,6 +34,7 @@ var ( // Templates titleTemplate *template.Template tagTemplate *template.Template + ocrTemplate *template.Template templateMutex sync.RWMutex // Default templates @@ -59,6 +60,8 @@ Content: Please concisely select the {{.Language}} tags from the list above that best describe the document. Be very selective and only choose the most relevant tags since too many tags will make the document less discoverable. ` + + defaultOcrPrompt = `Just transcribe the text in this image and preserve the formatting and layout (high quality OCR). Do that for ALL the text in the image. Be thorough and pay attention. This is very important. The image is from a text document so be sure to continue until the bottom of the page. Thanks a lot! You tend to forget about some text in the image so please focus! Use markdown format.` ) // App struct to hold dependencies @@ -142,6 +145,12 @@ func main() { api.POST("/documents/:id/ocr", app.submitOCRJobHandler) api.GET("/jobs/ocr/:job_id", app.getJobStatusHandler) api.GET("/jobs/ocr", app.getAllJobsHandler) + + // Endpoint to see if user enabled OCR + api.GET("/experimental/ocr", func(c *gin.Context) { + enabled := isOcrEnabled() + c.JSON(http.StatusOK, gin.H{"enabled": enabled}) + }) } // Serve static files for the frontend under /assets @@ -163,6 +172,10 @@ func main() { } } +func isOcrEnabled() bool { + return visionLlmModel != "" && visionLlmProvider != "" +} + // validateEnvVars ensures all necessary environment variables are set func validateEnvVars() { if paperlessBaseURL == "" { @@ -278,6 +291,21 @@ func loadTemplates() { if err != nil { log.Fatalf("Failed to parse tag template: %v", err) } + + // Load OCR template + ocrTemplatePath := filepath.Join(promptsDir, "ocr_prompt.tmpl") + ocrTemplateContent, err := os.ReadFile(ocrTemplatePath) + if err != nil { + log.Printf("Could not read %s, using default template: %v", ocrTemplatePath, err) + ocrTemplateContent = []byte(defaultOcrPrompt) + if err := os.WriteFile(ocrTemplatePath, ocrTemplateContent, os.ModePerm); err != nil { + log.Fatalf("Failed to write default OCR template to disk: %v", err) + } + } + ocrTemplate, err = template.New("ocr").Funcs(sprig.FuncMap()).Parse(string(ocrTemplateContent)) + if err != nil { + log.Fatalf("Failed to parse OCR template: %v", err) + } } // createLLM creates the appropriate LLM client based on the provider @@ -325,6 +353,7 @@ func createVisionLLM() (llms.Model, error) { ollama.WithServerURL(host), ) default: - return nil, fmt.Errorf("unsupported LLM provider: %s", llmProvider) + log.Printf("No Vision LLM provider created: %s", visionLlmProvider) + return nil, nil } } diff --git a/paperless.go b/paperless.go index 32cf1a3..03cc92a 100644 --- a/paperless.go +++ b/paperless.go @@ -222,6 +222,12 @@ func (c *PaperlessClient) UpdateDocuments(ctx context.Context, documents []Docum log.Printf("No valid title found for document %d, skipping.", documentID) } + // Suggested Content + suggestedContent := document.SuggestedContent + if suggestedContent != "" { + updatedFields["content"] = suggestedContent + } + // Marshal updated fields to JSON jsonData, err := json.Marshal(updatedFields) if err != nil { diff --git a/types.go b/types.go index 0320027..d0bb637 100644 --- a/types.go +++ b/types.go @@ -58,4 +58,5 @@ type DocumentSuggestion struct { OriginalDocument Document `json:"original_document"` SuggestedTitle string `json:"suggested_title,omitempty"` SuggestedTags []string `json:"suggested_tags,omitempty"` + SuggestedContent string `json:"suggested_content,omitempty"` } diff --git a/web-app/src/DocumentProcessor.tsx b/web-app/src/DocumentProcessor.tsx index cdc7c4a..d83709c 100644 --- a/web-app/src/DocumentProcessor.tsx +++ b/web-app/src/DocumentProcessor.tsx @@ -25,6 +25,7 @@ export interface DocumentSuggestion { original_document: Document; suggested_title?: string; suggested_tags?: string[]; + suggested_content?: string; } export interface TagOption { @@ -45,17 +46,22 @@ const DocumentProcessor: React.FC = () => { const [generateTags, setGenerateTags] = useState(true); const [error, setError] = useState(null); + // Temporary feature flags + const [ocrEnabled, setOcrEnabled] = useState(false); + // Custom hook to fetch initial data const fetchInitialData = useCallback(async () => { try { - const [filterTagRes, documentsRes, tagsRes] = await Promise.all([ + const [filterTagRes, documentsRes, tagsRes, ocrEnabledRes] = await Promise.all([ axios.get<{ tag: string }>("/api/filter-tag"), axios.get("/api/documents"), axios.get>("/api/tags"), + axios.get<{enabled: boolean}>("/api/experimental/ocr"), ]); setFilterTag(filterTagRes.data.tag); setDocuments(documentsRes.data); + setOcrEnabled(ocrEnabledRes.data.enabled); const tags = Object.keys(tagsRes.data).map((tag) => ({ id: tag, name: tag, @@ -193,14 +199,16 @@ const DocumentProcessor: React.FC = () => {

Paperless GPT

-
- - OCR via LLMs (Experimental) - -
+ {ocrEnabled && ( +
+ + OCR via LLMs (Experimental) + +
+ )}
{error && ( diff --git a/web-app/src/ExperimentalOCR.tsx b/web-app/src/ExperimentalOCR.tsx index de60cc5..bdf66f0 100644 --- a/web-app/src/ExperimentalOCR.tsx +++ b/web-app/src/ExperimentalOCR.tsx @@ -1,21 +1,25 @@ -// ExperimentalOCR.tsx import axios from 'axios'; -import React, { useState } from 'react'; +import React, { useCallback, useEffect, useState } from 'react'; import { FaSpinner } from 'react-icons/fa'; - +import { Document, DocumentSuggestion } from './DocumentProcessor'; const ExperimentalOCR: React.FC = () => { - const [documentId, setDocumentId] = useState(''); + const refreshInterval = 1000; // Refresh interval in milliseconds + const [documentId, setDocumentId] = useState(0); const [jobId, setJobId] = useState(''); const [ocrResult, setOcrResult] = useState(''); const [status, setStatus] = useState(''); - const [error, setError] = useState(''); + const [error, setError] = useState(''); + const [pagesDone, setPagesDone] = useState(0); // New state for pages done + const [saving, setSaving] = useState(false); // New state for saving + const [documentDetails, setDocumentDetails] = useState(null); // New state for document details const submitOCRJob = async () => { setStatus(''); setError(''); setJobId(''); setOcrResult(''); + setPagesDone(0); // Reset pages done try { setStatus('Submitting OCR job...'); @@ -34,6 +38,7 @@ const ExperimentalOCR: React.FC = () => { try { const response = await axios.get(`/api/jobs/ocr/${jobId}`); const jobStatus = response.data.status; + setPagesDone(response.data.pages_done); // Update pages done if (jobStatus === 'completed') { setOcrResult(response.data.result); setStatus('OCR completed successfully.'); @@ -43,7 +48,7 @@ const ExperimentalOCR: React.FC = () => { } else { setStatus(`Job status: ${jobStatus}. This may take a few minutes.`); // Automatically check again after a delay - setTimeout(checkJobStatus, 5000); + setTimeout(checkJobStatus, refreshInterval); } } catch (err) { console.error(err); @@ -51,8 +56,49 @@ const ExperimentalOCR: React.FC = () => { } }; + const handleSaveContent = async () => { + setSaving(true); + setError(null); + try { + if (!documentDetails) { + setError('Document details not fetched.'); + throw new Error('Document details not fetched.'); + } + const requestPayload: DocumentSuggestion = { + id: documentId, + original_document: documentDetails, // Use fetched document details + suggested_content: ocrResult, + }; + + await axios.post("/api/save-content", requestPayload); + setStatus('Content saved successfully.'); + } catch (err) { + console.error("Error saving content:", err); + setError("Failed to save content."); + } finally { + setSaving(false); + } + }; + + const fetchDocumentDetails = useCallback(async () => { + if (!documentId) return; + + try { + const response = await axios.get(`/api/documents/${documentId}`); + setDocumentDetails(response.data); + } catch (err) { + console.error("Error fetching document details:", err); + setError("Failed to fetch document details."); + } + }, [documentId]); + + // Fetch document details when documentId changes + useEffect(() => { + fetchDocumentDetails(); + }, [documentId, fetchDocumentDetails]); + // Start checking job status when jobId is set - React.useEffect(() => { + useEffect(() => { if (jobId) { checkJobStatus(); } @@ -71,10 +117,10 @@ const ExperimentalOCR: React.FC = () => { Document ID: setDocumentId(e.target.value)} + onChange={(e) => setDocumentId(Number(e.target.value))} className="border border-gray-300 dark:border-gray-700 rounded w-full p-2 focus:outline-none focus:ring-2 focus:ring-blue-500" placeholder="Enter the document ID" /> @@ -102,6 +148,11 @@ const ExperimentalOCR: React.FC = () => { )} {!status.includes('in_progress') && status} + {pagesDone > 0 && ( +
+ Pages processed: {pagesDone} +
+ )}
)} {error && ( @@ -115,6 +166,20 @@ const ExperimentalOCR: React.FC = () => {
{ocrResult}
+ )} @@ -122,4 +187,4 @@ const ExperimentalOCR: React.FC = () => { ); }; -export default ExperimentalOCR; +export default ExperimentalOCR; \ No newline at end of file