package ocr import ( "regexp" "strings" "testing" "cloud.google.com/go/documentai/apiv1/documentaipb" ) func TestGenerateHOCR(t *testing.T) { tests := []struct { name string doc *documentaipb.Document expected string }{ { name: "empty document", doc: &documentaipb.Document{}, expected: "", }, { name: "single page with one paragraph", doc: &documentaipb.Document{ Text: "Hello World", Pages: []*documentaipb.Document_Page{ { Dimension: &documentaipb.Document_Page_Dimension{ Width: 800, Height: 600, }, Paragraphs: []*documentaipb.Document_Page_Paragraph{ { Layout: &documentaipb.Document_Page_Layout{ BoundingPoly: &documentaipb.BoundingPoly{ NormalizedVertices: []*documentaipb.NormalizedVertex{ {X: 0.1, Y: 0.1}, {X: 0.9, Y: 0.1}, {X: 0.9, Y: 0.2}, {X: 0.1, Y: 0.2}, }, }, TextAnchor: &documentaipb.Document_TextAnchor{ TextSegments: []*documentaipb.Document_TextAnchor_TextSegment{ { StartIndex: 0, EndIndex: 11, }, }, }, }, }, }, }, }, }, expected: "(?s).*
.*" + "Hello World.*
.*