Commit bafb1fc7 for tesseract
commit bafb1fc77ca53097e4b40e6795cdcf71db4c5287
Author: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Date: Sun Feb 8 18:43:44 2026 +0000
Add CI test for PAGE XML multi-page closing tags
Co-authored-by: Stefan Weil <sw@weilnetz.de>
diff --git a/unittest/baseapi_test.cc b/unittest/baseapi_test.cc
index 4808fb3b..9aa47024 100644
--- a/unittest/baseapi_test.cc
+++ b/unittest/baseapi_test.cc
@@ -395,4 +395,80 @@ TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) {
}
}
+// Test that PAGE XML output properly closes all Page tags for multi-page documents.
+// GetPAGEText is called once per simulated page, and every per-page fragment is
+// expected to contain exactly one opening <Page tag and one closing </Page> tag.
+TEST_F(TesseractTest, PAGEXMLMultiPageClosingTags) {
+  tesseract::TessBaseAPI api;
+  if (api.Init(TessdataPath().c_str(), "eng") == -1) {
+    GTEST_SKIP();
+  }
+
+  // Counts the non-overlapping occurrences of `needle` in `text`. The search
+  // resumes right after each match, so the skip distance always follows the
+  // needle length instead of a hand-maintained constant.
+  auto count_occurrences = [](const std::string &text, const std::string &needle) {
+    size_t count = 0;
+    for (size_t pos = text.find(needle); pos != std::string::npos;
+         pos = text.find(needle, pos + needle.size())) {
+      ++count;
+    }
+    return count;
+  };
+
+  // Simulate two pages by calling GetPAGEText twice on the same image.
+  Image src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str());
+  CHECK(src_pix);
+  api.SetInputName("page1.tif");
+  api.SetImage(src_pix);
+
+  char *page1 = api.GetPAGEText(0);
+  ASSERT_TRUE(page1 != nullptr);
+
+  // Copy the text and release the API buffer immediately; the std::string
+  // owns the data for all following checks.
+  const std::string page1_str(page1);
+  delete[] page1;
+
+  // Each individual page output should have matching Page tags.
+  // Unsigned literals keep the size_t comparisons free of sign-compare warnings.
+  const size_t page1_open = count_occurrences(page1_str, "<Page");
+  const size_t page1_close = count_occurrences(page1_str, "</Page>");
+  EXPECT_EQ(page1_open, 1u) << "Each page should have exactly one opening <Page tag";
+  EXPECT_EQ(page1_close, 1u) << "Each page should have exactly one closing </Page> tag";
+  EXPECT_EQ(page1_open, page1_close) << "Opening and closing Page tags should match";
+
+  // Verify the closing tag is present and not part of PcGts
+  EXPECT_THAT(page1_str, HasSubstr("</Page>"));
+  EXPECT_THAT(page1_str, ::testing::Not(HasSubstr("</PcGts>")))
+      << "Individual page output should not contain document envelope";
+
+  // Test a second page to ensure each page closes properly
+  api.SetInputName("page2.tif");
+  api.SetImage(src_pix);
+  char *page2 = api.GetPAGEText(1);
+  ASSERT_TRUE(page2 != nullptr);
+
+  const std::string page2_str(page2);
+  delete[] page2;
+
+  EXPECT_EQ(count_occurrences(page2_str, "<Page"), 1u)
+      << "Second page should have exactly one opening <Page tag";
+  EXPECT_EQ(count_occurrences(page2_str, "</Page>"), 1u)
+      << "Second page should have exactly one closing </Page> tag";
+
+  src_pix.destroy();
+}
+
} // namespace tesseract