Update ocr/google_docai_provider.go

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
This commit is contained in:
Icereed 2025-02-12 05:50:23 +01:00 committed by GitHub
parent f0a73ed263
commit eb37f27e1b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -165,6 +165,10 @@ func generateHOCR(doc *documentaipb.Document) string {
for pageNum, page := range doc.GetPages() { for pageNum, page := range doc.GetPages() {
pageWidth := page.GetDimension().GetWidth() pageWidth := page.GetDimension().GetWidth()
pageHeight := page.GetDimension().GetHeight() pageHeight := page.GetDimension().GetHeight()
// Validate dimensions
if pageWidth <= 0 || pageHeight <= 0 {
continue
}
hocr.WriteString(fmt.Sprintf(` hocr.WriteString(fmt.Sprintf(`
<div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`, <div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`,
@@ -178,10 +182,18 @@ func generateHOCR(doc *documentaipb.Document) string {
} }
// Convert normalized coordinates to absolute // Convert normalized coordinates to absolute
x1 := int(paraBox[0].GetX() * pageWidth) // Use float64 for intermediate calculations to prevent overflow
y1 := int(paraBox[0].GetY() * pageHeight) x1 := int(float64(paraBox[0].GetX()) * float64(pageWidth))
x2 := int(paraBox[2].GetX() * pageWidth) y1 := int(float64(paraBox[0].GetY()) * float64(pageHeight))
y2 := int(paraBox[2].GetY() * pageHeight) x2 := int(float64(paraBox[2].GetX()) * float64(pageWidth))
y2 := int(float64(paraBox[2].GetY()) * float64(pageHeight))
// Validate coordinates
if x1 < 0 || y1 < 0 || x2 < 0 || y2 < 0 ||
x1 > int(pageWidth) || y1 > int(pageHeight) ||
x2 > int(pageWidth) || y2 > int(pageHeight) {
continue
}
hocr.WriteString(fmt.Sprintf(` hocr.WriteString(fmt.Sprintf(`
<p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`, <p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`,
@@ -194,6 +206,9 @@ func generateHOCR(doc *documentaipb.Document) string {
continue continue
} }
// Escape HTML special characters
text = html.EscapeString(text)
hocr.WriteString(fmt.Sprintf(` hocr.WriteString(fmt.Sprintf(`
<span class='ocrx_word'>%s</span>`, text)) <span class='ocrx_word'>%s</span>`, text))
} }