Dev news

Commit c0f1cefeb3 for aom

commit c0f1cefeb3a21617e163a846a99bd7a99ff86d13
Author: Satheesh Kumar <satheesh.kumar@ittiam.com>
Date:   Mon Feb 16 17:04:51 2026 +0530

    Move pred buffer allocation from stack to heap

    This patch moves the pred buffer used in upsampled_pref_error() and
    upsampled_obmc_pref_error() from the stack to a heap-allocated
    buffer referenced from the MACROBLOCK structure. This improves
    performance due to better stack memory management. The same
    heap-allocated buffer is also used as the temporary buffer in
    aom_upsampled_pred() and aom_highbd_upsampled_pred().

    Encoder performance results averaged over all resolutions
    are as follows:

          Instruction Count
    cpu     Reduction(%)
     1        1.339
     2        1.029

    This change is bit-exact for all presets.

    Change-Id: Ia3a9198f353080bc835bd08b464a2fa395de8474

diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index b95235fb7e..db912704d2 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -925,6 +925,11 @@ typedef struct macroblockd {
    * 'cpi->tile_thr_data[t].td->mb.tmp_pred_bufs'.
    */
   uint8_t *tmp_obmc_bufs[2];
+
+  /*!
+   *  Temporary buffer used for upsampled prediction.
+   */
+  uint8_t *tmp_upsample_pred;
 } MACROBLOCKD;

 /*!\cond */
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 194eb396b5..6734f34e7c 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -957,6 +957,11 @@ typedef struct macroblock {
    *   prediction.
    */
   uint8_t *tmp_pred_bufs[2];
+
+  /*!
+   *  Buffer used for upsampled prediction.
+   */
+  uint8_t *upsample_pred;
   /**@}*/

   /*****************************************************************************
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 95baf9d907..bebb1f6718 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -879,6 +879,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
   const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;
   const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
   FeatureFlags *const features = &cm->features;
+  const int is_highbitdepth = seq_params->use_highbitdepth;

   // in case of LAP, lag in frames is set according to number of lap buffers
   // calculated at init time. This stores and restores LAP's lag in frames to
@@ -976,6 +977,14 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
     }
   }

+  if (x->upsample_pred == NULL) {
+    CHECK_MEM_ERROR(
+        cm, x->upsample_pred,
+        aom_memalign(16, (1 + is_highbitdepth) * ((MAX_SB_SIZE + 16) + 16) *
+                             MAX_SB_SIZE * sizeof(*x->upsample_pred)));
+    x->e_mbd.tmp_upsample_pred = x->upsample_pred;
+  }
+
   av1_reset_segment_features(cm);

   av1_set_high_precision_mv(cpi, 1, 0);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 90f36b73a4..52bb9754c0 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1489,6 +1489,7 @@ typedef struct ThreadData {
   CONV_BUF_TYPE *tmp_conv_dst;
   uint64_t abs_sum_level;
   uint8_t *tmp_pred_bufs[2];
+  uint8_t *upsample_pred;
   uint8_t *wiener_tmp_pred_buf;
   int intrabc_used;
   int deltaq_used;
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 52b2a37606..33caa2ac9d 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -336,6 +336,7 @@ static inline void dealloc_compressor_data(AV1_COMP *cpi) {
   aom_free(cpi->td.mb.palette_buffer);
   release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
   aom_free(cpi->td.mb.tmp_conv_dst);
+  aom_free(cpi->td.mb.upsample_pred);
   for (int j = 0; j < 2; ++j) {
     aom_free(cpi->td.mb.tmp_pred_bufs[j]);
   }
@@ -478,6 +479,7 @@ static inline void free_thread_data(AV1_PRIMARY *ppi) {
     aom_free(td->tctx);
     aom_free(td->palette_buffer);
     aom_free(td->tmp_conv_dst);
+    aom_free(td->upsample_pred);
     release_compound_type_rd_buffers(&td->comp_rd_buffer);
     for (int j = 0; j < 2; ++j) {
       aom_free(td->tmp_pred_bufs[j]);
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index fad4f5119e..c10c662817 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -960,6 +960,7 @@ void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
   assert(p_mt_info->workers != NULL);
   assert(p_mt_info->tile_thr_data != NULL);

+  const int is_highbitdepth = ppi->seq_params.use_highbitdepth;
   int num_workers = p_mt_info->num_workers;
   int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC);
   assert(num_enc_workers <= num_workers);
@@ -989,6 +990,11 @@ void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
                              "Failed to allocate PICK_MODE_CONTEXT");
       }

+      AOM_CHECK_MEM_ERROR(
+          &ppi->error, td->upsample_pred,
+          aom_memalign(16, (1 + is_highbitdepth) * ((MAX_SB_SIZE + 16) + 16) *
+                               MAX_SB_SIZE * sizeof(*td->upsample_pred)));
+
       if (!is_first_pass && i < num_enc_workers) {
         // Set up sms_tree.
         if (av1_setup_sms_tree(ppi->cpi, td)) {
@@ -1630,6 +1636,7 @@ static inline void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
       thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
       thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
       thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+      thread_data->td->mb.upsample_pred = thread_data->td->upsample_pred;
       for (int j = 0; j < 2; ++j) {
         thread_data->td->mb.tmp_pred_bufs[j] =
             thread_data->td->tmp_pred_bufs[j];
@@ -1641,6 +1648,8 @@ static inline void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
           thread_data->td->src_var_info_of_4x4_sub_blocks;

       thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+      thread_data->td->mb.e_mbd.tmp_upsample_pred =
+          thread_data->td->mb.upsample_pred;
       for (int j = 0; j < 2; ++j) {
         thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
             thread_data->td->mb.tmp_pred_bufs[j];
@@ -2290,6 +2299,9 @@ static inline void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook,
       }
       thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
       thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+      thread_data->td->mb.upsample_pred = thread_data->td->upsample_pred;
+      thread_data->td->mb.e_mbd.tmp_upsample_pred =
+          thread_data->td->mb.upsample_pred;
     }
   }
 }
@@ -2466,6 +2478,9 @@ static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
         aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
                            "Error allocating temporal filter data");
       }
+      thread_data->td->mb.upsample_pred = thread_data->td->upsample_pred;
+      thread_data->td->mb.e_mbd.tmp_upsample_pred =
+          thread_data->td->mb.upsample_pred;
     }
   }
 }
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 06145ce1bc..b451a21ffe 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -2572,7 +2572,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
   unsigned int besterr;
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
-    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    uint16_t *pred16 = (uint16_t *)(xd->tmp_upsample_pred);
     uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
     if (second_pred != NULL) {
       if (mask) {
@@ -2593,7 +2593,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
     }
     besterr = vfp->vf(pred8, w, src, src_stride, sse);
   } else {
-    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    uint8_t *pred = xd->tmp_upsample_pred;
     if (second_pred != NULL) {
       if (mask) {
         aom_comp_mask_upsampled_pred(
@@ -2614,7 +2614,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
     besterr = vfp->vf(pred, w, src, src_stride, sse);
   }
 #else
-  DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+  uint8_t *pred = xd->tmp_upsample_pred;
   if (second_pred != NULL) {
     if (mask) {
       aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
@@ -3682,21 +3682,23 @@ static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
   const int mi_col = xd->mi_col;

   unsigned int besterr;
-  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
-    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+    uint16_t *pred16 = (uint16_t *)(xd->tmp_upsample_pred);
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
     aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
                               subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
                               subpel_search_type);
     besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
   } else {
+    uint8_t *pred = xd->tmp_upsample_pred;
     aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
                        subpel_y_q3, ref, ref_stride, subpel_search_type);

     besterr = vfp->ovf(pred, w, wsrc, mask, sse);
   }
 #else
+  uint8_t *pred = xd->tmp_upsample_pred;
   aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
                      subpel_y_q3, ref, ref_stride, subpel_search_type);

diff --git a/av1/encoder/x86/reconinter_enc_sse2.c b/av1/encoder/x86/reconinter_enc_sse2.c
index a18e172b99..8ab566a719 100644
--- a/av1/encoder/x86/reconinter_enc_sse2.c
+++ b/av1/encoder/x86/reconinter_enc_sse2.c
@@ -126,8 +126,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
     aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
                        width, height);
   } else {
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    uint8_t *temp = comp_pred;
     const int16_t *const kernel_x =
         av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
     const int16_t *const kernel_y =
@@ -230,8 +229,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
     aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
                               kernel, 16, width, height, bd);
   } else {
-    DECLARE_ALIGNED(16, uint16_t,
-                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    uint16_t *temp = CONVERT_TO_SHORTPTR(comp_pred8);
     const int16_t *const kernel_x =
         av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
     const int16_t *const kernel_y =
diff --git a/test/comp_mask_pred_test.cc b/test/comp_mask_pred_test.cc
index 7a23398d73..f57160ce0f 100644
--- a/test/comp_mask_pred_test.cc
+++ b/test/comp_mask_pred_test.cc
@@ -81,9 +81,11 @@ AV1CompMaskPredBase::~AV1CompMaskPredBase() = default;
 void AV1CompMaskPredBase::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
   av1_init_wedge_masks();
-  comp_pred1_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  comp_pred1_ =
+      (uint8_t *)aom_memalign(16, ((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE);
   ASSERT_NE(comp_pred1_, nullptr);
-  comp_pred2_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  comp_pred2_ =
+      (uint8_t *)aom_memalign(16, ((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE);
   ASSERT_NE(comp_pred2_, nullptr);
   pred_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
   ASSERT_NE(pred_, nullptr);
@@ -455,11 +457,11 @@ void AV1HighbdCompMaskPredTestBase::SetUp() {
   rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
   av1_init_wedge_masks();

-  comp_pred1_ =
-      (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_));
+  comp_pred1_ = (uint16_t *)aom_memalign(
+      16, ((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE * sizeof(*comp_pred1_));
   ASSERT_NE(comp_pred1_, nullptr);
-  comp_pred2_ =
-      (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_));
+  comp_pred2_ = (uint16_t *)aom_memalign(
+      16, ((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE * sizeof(*comp_pred2_));
   ASSERT_NE(comp_pred2_, nullptr);
   pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_));
   ASSERT_NE(pred_, nullptr);