Commit 098d8d75 for tesseract
commit 098d8d754e7adfd760e712209dae08a0ab42e41d
Author: Stefan Weil <sw@weilnetz.de>
Date: Thu Jun 18 18:47:24 2026 +0200
Remove goto statements from acceptable_word_string
Replace the two goto statements in acceptable_word_string() with
structured control flow by extracting the abbreviation checking logic
into a separate check_abbreviation() helper function.
This eliminates all goto statements from the function and makes
the control flow clearer.
Assisted-by: minimax-m2.7 (MiniMax)
Signed-off-by: Stefan Weil <sw@weilnetz.de>
diff --git a/src/ccmain/control.cpp b/src/ccmain/control.cpp
index 454aa94f..b0847bc5 100644
--- a/src/ccmain/control.cpp
+++ b/src/ccmain/control.cpp
@@ -1729,7 +1729,7 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_se
offset += lengths[i++];
}
if (i - leading_punct_count < quality_min_initial_alphas_reqd) {
- goto not_a_word;
+ return check_abbreviation(char_set, s, lengths, AC_UNACCEPTABLE);
}
/*
Allow a single hyphen in a lower case word
@@ -1743,7 +1743,7 @@ Allow a single hyphen in a lower case word
offset += lengths[i++];
}
if (i < hyphen_pos + 3) {
- goto not_a_word;
+ return check_abbreviation(char_set, s, lengths, AC_UNACCEPTABLE);
}
}
} else {
@@ -1774,32 +1774,51 @@ Allow a single hyphen in a lower case word
word_type = AC_UNACCEPTABLE;
}
-not_a_word:
+ return check_abbreviation(char_set, s, lengths, word_type);
+}
+/**
+ * Abbreviation fallback used by acceptable_word_string().
+ *
+ * When word_type is AC_UNACCEPTABLE, accept s as an abbreviation if it is made
+ * up entirely of <letter><period> pairs: AC_UC_ABBREV if the letters are upper
+ * case, AC_LC_ABBREV if they are lower case. The case of the first character
+ * selects which of the two is tried. Any other word_type is returned unchanged.
+ *
+ * @param char_set Character set queried for upper/lower case.
+ * @param s Word text to examine.
+ * @param lengths Length in bytes of each character of s, one entry per character.
+ * @param word_type Classification reached so far by the caller.
+ * @return The resulting word type.
+ */
+ACCEPTABLE_WERD_TYPE Tesseract::check_abbreviation(const UNICHARSET &char_set, const char *s,
+ const char *lengths,
+ ACCEPTABLE_WERD_TYPE word_type) {
if (word_type == AC_UNACCEPTABLE) {
/* Look for abbreviation string */
- i = 0;
- offset = 0;
+ // i counts characters (index into lengths); offset counts bytes (index into s).
+ // They differ whenever a character is longer than one byte, so advance them separately.
+ int i = 0;
+ int offset = 0;
if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
word_type = AC_UC_ABBREV;
while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) &&
lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
offset += lengths[i++];
offset += lengths[i++];
}
} else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
word_type = AC_LC_ABBREV;
while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) &&
lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
offset += lengths[i++];
offset += lengths[i++];
}
}
if (s[offset] != '\0') {
word_type = AC_UNACCEPTABLE;
}
}
-
return word_type;
}
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
index 42f8febc..1dc76abd 100644
--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
@@ -445,6 +445,8 @@ public:
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s,
const char *lengths);
+ ACCEPTABLE_WERD_TYPE check_abbreviation(const UNICHARSET &char_set, const char *s,
+ const char *lengths, ACCEPTABLE_WERD_TYPE word_type);
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block);
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word,
PointerVector<WERD_RES> *out_words);