// paperless-gpt/ocr/google_docai_provider_test.go

package ocr

import (
	"regexp"
	"strings"
	"testing"

	"cloud.google.com/go/documentai/apiv1/documentaipb"
)

// TestGenerateHOCR runs generateHOCR against a table of Document AI documents.
//
// A case with an empty wantPattern requires the output to be the empty string.
// Any other case requires the output to match wantPattern (a regular
// expression) and to contain the XML declaration, the XHTML namespace and the
// google-docai ocr-system meta tag.
func TestGenerateHOCR(t *testing.T) {
	// Fixture: an 800x600 page holding one paragraph whose text anchor spans
	// bytes [0, 11) of the document text "Hello World".
	paragraph := &documentaipb.Document_Page_Paragraph{
		Layout: &documentaipb.Document_Page_Layout{
			BoundingPoly: &documentaipb.BoundingPoly{
				NormalizedVertices: []*documentaipb.NormalizedVertex{
					{X: 0.1, Y: 0.1},
					{X: 0.9, Y: 0.1},
					{X: 0.9, Y: 0.2},
					{X: 0.1, Y: 0.2},
				},
			},
			TextAnchor: &documentaipb.Document_TextAnchor{
				TextSegments: []*documentaipb.Document_TextAnchor_TextSegment{
					{StartIndex: 0, EndIndex: 11},
				},
			},
		},
	}
	page := &documentaipb.Document_Page{
		Dimension: &documentaipb.Document_Page_Dimension{
			Width:  800,
			Height: 600,
		},
		Paragraphs: []*documentaipb.Document_Page_Paragraph{paragraph},
	}
	helloWorldDoc := &documentaipb.Document{
		Text:  "Hello World",
		Pages: []*documentaipb.Document_Page{page},
	}

	// (?s) lets '.' cross newlines, so the pattern only pins the page div,
	// the paragraph element (with its pixel bbox) and the word span, in order.
	const helloWorldPattern = "(?s).*<div class='ocr_page' id='page_1' title='image;bbox 0 0 800 600'>.*" +
		"<p class='ocr_par' id='par_1_1' title='bbox 80 60 719 120'>.*" +
		"<span class='ocrx_word'>Hello World</span>.*</p>.*</div>.*"

	cases := []struct {
		label       string
		input       *documentaipb.Document
		wantPattern string // empty means "output must be empty"
	}{
		{label: "empty document", input: &documentaipb.Document{}},
		{label: "single page with one paragraph", input: helloWorldDoc, wantPattern: helloWorldPattern},
	}

	// Fragments every non-empty hOCR output must contain, paired with the
	// message reported when the fragment is absent.
	requiredFragments := []struct {
		fragment string
		failure  string
	}{
		{"<?xml version=\"1.0\" encoding=\"UTF-8\"?>", "missing XML declaration"},
		{"<html xmlns=\"http://www.w3.org/1999/xhtml\"", "missing HTML namespace"},
		{"<meta name='ocr-system' content='google-docai'", "missing OCR system metadata"},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			got := generateHOCR(tc.input)

			// Empty-output cases skip the pattern and structure checks.
			if tc.wantPattern == "" {
				if got != "" {
					t.Errorf("expected empty string, got %v", got)
				}
				return
			}

			switch ok, err := regexp.MatchString(tc.wantPattern, got); {
			case err != nil:
				t.Fatalf("error matching regex: %v", err)
			case !ok:
				t.Errorf("expected to match regex %v\ngot: %v", tc.wantPattern, got)
			}

			// Verify basic hOCR structure
			for _, req := range requiredFragments {
				if !strings.Contains(got, req.fragment) {
					t.Error(req.failure)
				}
			}
		})
	}
}