Dev news

Commit 70948280 for tesseract

commit 709482807a11e0fe9bd48d91dbec3d23de486542
Author: Stefan Weil <sw@weilnetz.de>
Date:   Thu Jun 4 17:46:53 2026 +0200

    Replace C-style arrays with std::array

    Assisted-by: OpenCode / BigPickle
    Signed-off-by: Stefan Weil <sw@weilnetz.de>

diff --git a/src/ccstruct/blobbox.h b/src/ccstruct/blobbox.h
index f25d00fe..497da78b 100644
--- a/src/ccstruct/blobbox.h
+++ b/src/ccstruct/blobbox.h
@@ -35,6 +35,8 @@
 #include "werd.h"       // for WERD_LIST

 #include <cinttypes> // for PRId32
+
+#include <array>
 #include <cmath>     // for std::sqrt
 #include <cstdint>   // for int16_t, int32_t

@@ -537,11 +539,11 @@ private:
   int32_t line_crossings_;                   // Number of line intersections touched.
   BLOBNBOX *base_char_blob_;                 // The blob that was the base char.
   tesseract::ColPartition *owner_;           // Who will delete me when I am not needed
-  BLOBNBOX *neighbours_[BND_COUNT];
+  std::array<BLOBNBOX *, BND_COUNT> neighbours_;
   float horz_stroke_width_ = 0.0f; // Median horizontal stroke width
   float vert_stroke_width_ = 0.0f; // Median vertical stroke width
   float area_stroke_width_ = 0.0f; // Stroke width from area/perimeter ratio.
-  bool good_stroke_neighbours_[BND_COUNT];
+  std::array<bool, BND_COUNT> good_stroke_neighbours_;
   bool horz_possible_;   // Could be part of horizontal flow.
   bool vert_possible_;   // Could be part of vertical flow.
   bool leader_on_left_;  // There is a leader to the left.
diff --git a/src/ccstruct/coutln.cpp b/src/ccstruct/coutln.cpp
index 55221d13..0e81ad8a 100644
--- a/src/ccstruct/coutln.cpp
+++ b/src/ccstruct/coutln.cpp
@@ -42,7 +42,7 @@

 namespace tesseract {

-ICOORD C_OUTLINE::step_coords[4] = {ICOORD(-1, 0), ICOORD(0, -1), ICOORD(1, 0), ICOORD(0, 1)};
+std::array<ICOORD, 4> C_OUTLINE::step_coords = {ICOORD(-1, 0), ICOORD(0, -1), ICOORD(1, 0), ICOORD(0, 1)};

 /**
  * @name C_OUTLINE::C_OUTLINE
diff --git a/src/ccstruct/coutln.h b/src/ccstruct/coutln.h
index fd08fd6d..043de38e 100644
--- a/src/ccstruct/coutln.h
+++ b/src/ccstruct/coutln.h
@@ -28,6 +28,8 @@
 #include <tesseract/export.h> // for DLLSYM

 #include <cstdint> // for int16_t, int32_t
+
+#include <array>
 #include <bitset>  // for std::bitset<16>

 struct Pix;
@@ -289,7 +291,7 @@ private:
   std::vector<uint8_t> steps; // step array
   EdgeOffset *offsets;     // Higher precision edge.
   C_OUTLINE_LIST children; // child elements
-  static ICOORD step_coords[4];
+  static std::array<ICOORD, 4> step_coords;
 };

 } // namespace tesseract
diff --git a/src/ccstruct/seam.h b/src/ccstruct/seam.h
index 73acaea7..7859780b 100644
--- a/src/ccstruct/seam.h
+++ b/src/ccstruct/seam.h
@@ -26,6 +26,8 @@
 #include "blobs.h"
 #include "split.h"

+#include <array>
+
 namespace tesseract {

 using PRIORITY = float; /*  PRIORITY  */
@@ -196,7 +198,7 @@ private:
   // Number of splits_ that are used.
   uint8_t num_splits_;
   // Set of pairs of points that are the ends of each split in the SEAM.
-  SPLIT splits_[kMaxNumSplits];
+  std::array<SPLIT, kMaxNumSplits> splits_;
 };

 void start_seam_list(TWERD *word, std::vector<SEAM *> *seam_array);
diff --git a/src/ccutil/ambigs.cpp b/src/ccutil/ambigs.cpp
index 5aafedb1..4fff07b6 100644
--- a/src/ccutil/ambigs.cpp
+++ b/src/ccutil/ambigs.cpp
@@ -36,8 +36,8 @@ static const char kIllegalUnicharMsg[] = "Illegal unichar %s in ambiguity specif

 // Maximum line size:
 //   10 for sizes of ambigs, tabs, abmig type and newline
-//   UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
-const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
+//   UNICHAR_LEN * (kMaxAmbigSize + 1) for each part of the ambig
+const int kMaxAmbigStringSize = UNICHAR_LEN * (kMaxAmbigSize + 1);

 AmbigSpec::AmbigSpec() : correct_ngram_id(INVALID_UNICHAR_ID), type(NOT_AMBIG), wrong_ngram_size(0) {
   wrong_ngram[0] = INVALID_UNICHAR_ID;
@@ -81,7 +81,7 @@ void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambi
   const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
   char *buffer = new char[kBufferSize];
   char replacement_string[kMaxAmbigStringSize];
-  UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1];
+  std::array<UNICHAR_ID, kMaxAmbigSize + 1> test_unichar_ids;
   int line_num = 0;
   int type = NOT_AMBIG;

@@ -101,14 +101,14 @@ void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambi
     }
     ++line_num;
     if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set, buffer,
-                            &test_ambig_part_size, test_unichar_ids, &replacement_ambig_part_size,
+                            &test_ambig_part_size, test_unichar_ids.data(), &replacement_ambig_part_size,
                             replacement_string, &type)) {
       continue;
     }
     // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
     auto *ambig_spec = new AmbigSpec();
     if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
-                         test_ambig_part_size, test_unichar_ids, replacement_ambig_part_size,
+                         test_ambig_part_size, test_unichar_ids.data(), replacement_ambig_part_size,
                          replacement_string, type, ambig_spec, unicharset)) {
       continue;
     }
@@ -188,9 +188,9 @@ void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambi
         for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
           AmbigSpec *ambig_spec = lst_it.data();
           tprintf("wrong_ngram:");
-          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
+          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram.data(), *unicharset);
           tprintf("correct_fragments:");
-          UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
+          UnicharIdArrayUtils::print(ambig_spec->correct_fragments.data(), *unicharset);
         }
       }
     }
@@ -235,7 +235,7 @@ bool UnicharAmbigs::ParseAmbiguityLine(int line_num, int version, int debug_leve
       return false;
     }
     *test_ambig_part_size = unichars.size();
-    if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
+    if (*test_ambig_part_size > kMaxAmbigSize) {
       if (debug_level) {
         tprintf("Too many unichars in ambiguity on line %d\n", line_num);
       }
@@ -251,7 +251,7 @@ bool UnicharAmbigs::ParseAmbiguityLine(int line_num, int version, int debug_leve
       return false;
     }
     *replacement_ambig_part_size = unichars.size();
-    if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
+    if (*replacement_ambig_part_size > kMaxAmbigSize) {
       if (debug_level) {
         tprintf("Too many unichars in ambiguity on line %d\n", line_num);
       }
@@ -276,7 +276,7 @@ bool UnicharAmbigs::ParseAmbiguityLine(int line_num, int version, int debug_leve
     }
     return false;
   }
-  if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
+  if (*test_ambig_part_size > kMaxAmbigSize) {
     if (debug_level) {
       tprintf("Too many unichars in ambiguity on line %d\n", line_num);
     }
@@ -304,7 +304,7 @@ bool UnicharAmbigs::ParseAmbiguityLine(int line_num, int version, int debug_leve
     }
     return false;
   }
-  if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
+  if (*replacement_ambig_part_size > kMaxAmbigSize) {
     if (debug_level) {
       tprintf("Too many unichars in ambiguity on line %d\n", line_num);
     }
@@ -362,7 +362,7 @@ bool UnicharAmbigs::InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_p
   }

   ambig_spec->wrong_ngram_size =
-      UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram);
+      UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram.data());

   // Since we need to maintain a constant number of unichar positions in
   // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for
diff --git a/src/ccutil/ambigs.h b/src/ccutil/ambigs.h
index effedbf7..b80e7db4 100644
--- a/src/ccutil/ambigs.h
+++ b/src/ccutil/ambigs.h
@@ -31,10 +31,12 @@
 #  include "tprintf.h"
 #  include "unicharset.h"

-#  define MAX_AMBIG_SIZE 10
+#  include <array>

 namespace tesseract {

+constexpr int kMaxAmbigSize = 10;
+
 using UnicharIdVector = std::vector<UNICHAR_ID>;

 enum AmbigType {
@@ -118,15 +120,15 @@ public:
   // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
   // in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
   static int compare_ambig_specs(const AmbigSpec *s1, const AmbigSpec *s2) {
-    int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
+    int result = UnicharIdArrayUtils::compare(s1->wrong_ngram.data(), s2->wrong_ngram.data());
     if (result != 0) {
       return result;
     }
-    return UnicharIdArrayUtils::compare(s1->correct_fragments, s2->correct_fragments);
+    return UnicharIdArrayUtils::compare(s1->correct_fragments.data(), s2->correct_fragments.data());
   }

-  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
-  UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
+  std::array<UNICHAR_ID, kMaxAmbigSize + 1> wrong_ngram;
+  std::array<UNICHAR_ID, kMaxAmbigSize + 1> correct_fragments;
   UNICHAR_ID correct_ngram_id;
   AmbigType type;
   int wrong_ngram_size;
diff --git a/src/ccutil/unicharcompress.h b/src/ccutil/unicharcompress.h
index 2e81bbde..45889d26 100644
--- a/src/ccutil/unicharcompress.h
+++ b/src/ccutil/unicharcompress.h
@@ -23,6 +23,9 @@

 #include <unordered_map>
 #include <vector>
+
+#include <array>
+
 #include "serialis.h"
 #include "unicharset.h"

@@ -35,7 +38,7 @@ public:
   static const int kMaxCodeLen = 9;

   RecodedCharID() : self_normalized_(1), length_(0) {
-    memset(code_, 0, sizeof(code_));
+    code_.fill(0);
   }
   void Truncate(int length) {
     length_ = length;
@@ -105,7 +108,7 @@ private:
   // The number of elements in use in code_;
   int32_t length_;
   // The re-encoded form of the unichar-id to which this RecodedCharID relates.
-  int32_t code_[kMaxCodeLen];
+  std::array<int32_t, kMaxCodeLen> code_;
 };

 // Class holds a "compression" of a unicharset to simplify the learning problem
diff --git a/src/classify/trainingsample.cpp b/src/classify/trainingsample.cpp
index f08c1a2b..b0f83008 100644
--- a/src/classify/trainingsample.cpp
+++ b/src/classify/trainingsample.cpp
@@ -36,8 +36,8 @@ namespace tesseract {
 const int kRandomizingCenter = 128;

 // Randomizing factors.
-const int TrainingSample::kYShiftValues[kSampleYShiftSize] = {6, 3, -3, -6, 0};
-const double TrainingSample::kScaleValues[kSampleScaleSize] = {1.0625, 0.9375, 1.0};
+const std::array<int, kSampleYShiftSize> TrainingSample::kYShiftValues = {6, 3, -3, -6, 0};
+const std::array<double, kSampleScaleSize> TrainingSample::kScaleValues = {1.0625, 0.9375, 1.0};

 TrainingSample::~TrainingSample() {
   delete[] features_;
@@ -77,10 +77,10 @@ bool TrainingSample::Serialize(FILE *fp) const {
       num_micro_features_) {
     return false;
   }
-  if (fwrite(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) != kNumCNParams) {
+  if (fwrite(cn_feature_.data(), sizeof(cn_feature_[0]), kNumCNParams, fp) != kNumCNParams) {
     return false;
   }
-  if (fwrite(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount) {
+  if (fwrite(geo_feature_.data(), sizeof(geo_feature_[0]), GeoCount, fp) != GeoCount) {
     return false;
   }
   return true;
@@ -145,10 +145,10 @@ bool TrainingSample::DeSerialize(bool swap, FILE *fp) {
       num_micro_features_) {
     return false;
   }
-  if (fread(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) != kNumCNParams) {
+  if (fread(cn_feature_.data(), sizeof(cn_feature_[0]), kNumCNParams, fp) != kNumCNParams) {
     return false;
   }
-  if (fread(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount) {
+  if (fread(geo_feature_.data(), sizeof(geo_feature_[0]), GeoCount, fp) != GeoCount) {
     return false;
   }
   return true;
@@ -227,8 +227,8 @@ TrainingSample *TrainingSample::Copy() const {
     memcpy(sample->micro_features_, micro_features_,
            num_micro_features_ * sizeof(micro_features_[0]));
   }
-  memcpy(sample->cn_feature_, cn_feature_, sizeof(*cn_feature_) * kNumCNParams);
-  memcpy(sample->geo_feature_, geo_feature_, sizeof(*geo_feature_) * GeoCount);
+  memcpy(sample->cn_feature_.data(), cn_feature_.data(), sizeof(cn_feature_[0]) * kNumCNParams);
+  memcpy(sample->geo_feature_.data(), geo_feature_.data(), sizeof(geo_feature_[0]) * GeoCount);
   return sample;
 }

diff --git a/src/classify/trainingsample.h b/src/classify/trainingsample.h
index 211ab669..12cb0fa8 100644
--- a/src/classify/trainingsample.h
+++ b/src/classify/trainingsample.h
@@ -27,6 +27,8 @@
 #include "shapetable.h"
 #include "unicharset.h"

+#include <array>
+
 struct Pix;

 namespace tesseract {
@@ -221,10 +223,10 @@ private:
   // Array of features.
   MicroFeature *micro_features_;
   // The one and only CN feature. Indexed by NORM_PARAM_NAME enum.
-  float cn_feature_[kNumCNParams];
+  std::array<float, kNumCNParams> cn_feature_;
   // The one and only geometric feature. (Aims at replacing cn_feature_).
   // Indexed by GeoParams enum in picofeat.h
-  int geo_feature_[GeoCount];
+  std::array<int, GeoCount> geo_feature_;

   // Non-serialized cache data.
   // Weight used for boosting training.
@@ -249,8 +251,8 @@ private:
   bool is_error_;

   // Randomizing factors.
-  static const int kYShiftValues[kSampleYShiftSize];
-  static const double kScaleValues[kSampleScaleSize];
+  static const std::array<int, kSampleYShiftSize> kYShiftValues;
+  static const std::array<double, kSampleScaleSize> kScaleValues;
 };

 ELISTIZEH(TrainingSample)
diff --git a/src/dict/stopper.cpp b/src/dict/stopper.cpp
index a1885daf..590d6cc7 100644
--- a/src/dict/stopper.cpp
+++ b/src/dict/stopper.cpp
@@ -198,7 +198,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
         ambig_blob_choices.push_back(lst);
       }
     }
-    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
+    UNICHAR_ID wrong_ngram[kMaxAmbigSize + 1];
     int wrong_ngram_index;
     int blob_index = 0;
     for (unsigned i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) {
@@ -218,12 +218,12 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
       for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
         const AmbigSpec *ambig_spec = spec_it.data();
         wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID;
-        int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram);
+        int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram.data());
         if (stopper_debug_level > 2) {
           tprintf("candidate ngram: ");
           UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
           tprintf("current ngram from spec: ");
-          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
+          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram.data(), getUnicharset());
           tprintf("comparison result: %d\n", compare);
         }
         if (compare == 0) {
@@ -244,7 +244,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
             if (stopper_debug_level > 2) {
               tprintf("replace ambiguity with %s : ",
                       getUnicharset().id_to_unichar(ambig_spec->correct_ngram_id));
-              UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());
+              UnicharIdArrayUtils::print(ambig_spec->correct_fragments.data(), getUnicharset());
             }
             ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice,
                          ratings);
@@ -252,7 +252,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
             // We found dang ambig - update ambig_blob_choices.
             if (stopper_debug_level > 2) {
               tprintf("found ambiguity: ");
-              UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());
+              UnicharIdArrayUtils::print(ambig_spec->correct_fragments.data(), getUnicharset());
             }
             ambigs_found = true;
             for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) {
diff --git a/src/wordrec/lm_consistency.h b/src/wordrec/lm_consistency.h
index e9ae2fc9..3bd30e8d 100644
--- a/src/wordrec/lm_consistency.h
+++ b/src/wordrec/lm_consistency.h
@@ -20,7 +20,9 @@
 #ifndef TESSERACT_WORDREC_LM_CONSISTENCY_H_
 #define TESSERACT_WORDREC_LM_CONSISTENCY_H_

+#include <array>
 #include <cstdint> // for INT16_MAX
+
 #include "dawg.h"  // for EDGE_REF, NO_EDGE
 #include "dict.h"  // for XH_GOOD, XH_INCONSISTENT, XHeightConsi...

@@ -128,10 +130,10 @@ struct LMConsistencyInfo {
   int script_id;
   int num_inconsistent_spaces;
   // Metrics clumped by position.
-  float xht_lo[kNumPos];
-  float xht_hi[kNumPos];
-  int16_t xht_count[kNumPos];
-  int16_t xht_count_punc[kNumPos];
+  std::array<float, kNumPos> xht_lo;
+  std::array<float, kNumPos> xht_hi;
+  std::array<int16_t, kNumPos> xht_count;
+  std::array<int16_t, kNumPos> xht_count_punc;
   int16_t xht_sp;
   int16_t xpos_entropy;
   bool invalid_punc;