Dev news

Commit 098d8d75 for tesseract

commit 098d8d754e7adfd760e712209dae08a0ab42e41d
Author: Stefan Weil <sw@weilnetz.de>
Date:   Thu Jun 18 18:47:24 2026 +0200

    Remove goto statements from acceptable_word_string

    Replace the two goto statements in acceptable_word_string() with
    structured control flow by extracting the abbreviation checking logic
    into a separate check_abbreviation() helper function.

    This eliminates all goto statements from the function and makes
    the control flow clearer.

    Assisted-by: minimax-m2.7 (MiniMax)
    Signed-off-by: Stefan Weil <sw@weilnetz.de>

diff --git a/src/ccmain/control.cpp b/src/ccmain/control.cpp
index 454aa94f..b0847bc5 100644
--- a/src/ccmain/control.cpp
+++ b/src/ccmain/control.cpp
@@ -1729,7 +1729,7 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_se
       offset += lengths[i++];
     }
     if (i - leading_punct_count < quality_min_initial_alphas_reqd) {
-      goto not_a_word;
+      return check_abbreviation(char_set, s, lengths, AC_UNACCEPTABLE);
     }
     /*
 Allow a single hyphen in a lower case word
@@ -1743,7 +1743,7 @@ Allow a single hyphen in a lower case word
           offset += lengths[i++];
         }
         if (i < hyphen_pos + 3) {
-          goto not_a_word;
+          return check_abbreviation(char_set, s, lengths, AC_UNACCEPTABLE);
         }
       }
     } else {
@@ -1774,32 +1774,42 @@ Allow a single hyphen in a lower case word
     word_type = AC_UNACCEPTABLE;
   }

-not_a_word:
+  return check_abbreviation(char_set, s, lengths, word_type);
+}

+/**
+ * Re-examines a word that was classified as AC_UNACCEPTABLE and accepts it as
+ * an abbreviation when it consists solely of letters of one case, each
+ * followed by a single-byte '.' (result AC_UC_ABBREV or AC_LC_ABBREV).
+ * Any other word_type is returned unchanged.
+ * s is walked by byte offset, lengths by character index.
+ */
+ACCEPTABLE_WERD_TYPE Tesseract::check_abbreviation(const UNICHARSET &char_set, const char *s,
+                                                   const char *lengths,
+                                                   ACCEPTABLE_WERD_TYPE word_type) {
   if (word_type == AC_UNACCEPTABLE) {
     /* Look for abbreviation string */
-    i = 0;
-    offset = 0;
+    int i = 0;      // character index into lengths
+    int offset = 0; // byte offset into s; differs from i for multi-byte characters
     if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
       word_type = AC_UC_ABBREV;
       while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) &&
              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
         offset += lengths[i++];
         offset += lengths[i++];
       }
     } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
       word_type = AC_LC_ABBREV;
       while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) &&
              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
         offset += lengths[i++];
         offset += lengths[i++];
       }
     }
     if (s[offset] != '\0') {
       word_type = AC_UNACCEPTABLE;
     }
   }
-
   return word_type;
 }

diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
index 42f8febc..1dc76abd 100644
--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
@@ -445,6 +445,8 @@ public:

   ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s,
                                               const char *lengths);
+  ACCEPTABLE_WERD_TYPE check_abbreviation(const UNICHARSET &char_set, const char *s,
+                                           const char *lengths, ACCEPTABLE_WERD_TYPE word_type);
   void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block);
   void classify_word_pass2(const WordData &word_data, WERD_RES **in_word,
                            PointerVector<WERD_RES> *out_words);