mirror of
https://github.com/icereed/paperless-gpt.git
synced 2025-03-12 21:08:00 -05:00
Update ocr/google_docai_provider.go
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
This commit is contained in:
parent
f0a73ed263
commit
eb37f27e1b
1 changed file with 19 additions and 4 deletions
|
@@ -165,6 +165,10 @@ func generateHOCR(doc *documentaipb.Document) string {
|
||||||
for pageNum, page := range doc.GetPages() {
|
for pageNum, page := range doc.GetPages() {
|
||||||
pageWidth := page.GetDimension().GetWidth()
|
pageWidth := page.GetDimension().GetWidth()
|
||||||
pageHeight := page.GetDimension().GetHeight()
|
pageHeight := page.GetDimension().GetHeight()
|
||||||
|
// Validate dimensions
|
||||||
|
if pageWidth <= 0 || pageHeight <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
hocr.WriteString(fmt.Sprintf(`
|
hocr.WriteString(fmt.Sprintf(`
|
||||||
<div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`,
|
<div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`,
|
||||||
|
@@ -178,10 +182,18 @@ func generateHOCR(doc *documentaipb.Document) string {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert normalized coordinates to absolute
|
// Convert normalized coordinates to absolute
|
||||||
x1 := int(paraBox[0].GetX() * pageWidth)
|
// Use float64 for intermediate calculations to prevent overflow
|
||||||
y1 := int(paraBox[0].GetY() * pageHeight)
|
x1 := int(float64(paraBox[0].GetX()) * float64(pageWidth))
|
||||||
x2 := int(paraBox[2].GetX() * pageWidth)
|
y1 := int(float64(paraBox[0].GetY()) * float64(pageHeight))
|
||||||
y2 := int(paraBox[2].GetY() * pageHeight)
|
x2 := int(float64(paraBox[2].GetX()) * float64(pageWidth))
|
||||||
|
y2 := int(float64(paraBox[2].GetY()) * float64(pageHeight))
|
||||||
|
|
||||||
|
// Validate coordinates
|
||||||
|
if x1 < 0 || y1 < 0 || x2 < 0 || y2 < 0 ||
|
||||||
|
x1 > int(pageWidth) || y1 > int(pageHeight) ||
|
||||||
|
x2 > int(pageWidth) || y2 > int(pageHeight) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
hocr.WriteString(fmt.Sprintf(`
|
hocr.WriteString(fmt.Sprintf(`
|
||||||
<p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`,
|
<p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`,
|
||||||
|
@@ -194,6 +206,9 @@ func generateHOCR(doc *documentaipb.Document) string {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Escape HTML special characters
|
||||||
|
text = html.EscapeString(text)
|
||||||
|
|
||||||
hocr.WriteString(fmt.Sprintf(`
|
hocr.WriteString(fmt.Sprintf(`
|
||||||
<span class='ocrx_word'>%s</span>`, text))
|
<span class='ocrx_word'>%s</span>`, text))
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue