Commit c0f1cefeb3 for aom
commit c0f1cefeb3a21617e163a846a99bd7a99ff86d13
Author: Satheesh Kumar <satheesh.kumar@ittiam.com>
Date: Mon Feb 16 17:04:51 2026 +0530
Move pred buffer allocation from stack to heap
This patch moves the pred buffer used in upsampled_pref_error() and
upsampled_obmc_pref_error() from the stack to a heap-allocated buffer
referenced from the MACROBLOCK structure. This improves performance
through better stack memory management. The same heap-allocated buffer
is also used as the temporary buffer in the SSE2 implementations of
aom_upsampled_pred() and aom_highbd_upsampled_pred().
Encoder performance results, averaged over all resolutions, are as
follows:

cpu   Instruction Count Reduction (%)
 1    1.339
 2    1.029
This change is bit-exact for all presets.
Change-Id: Ia3a9198f353080bc835bd08b464a2fa395de8474
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index b95235fb7e..db912704d2 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -925,6 +925,11 @@ typedef struct macroblockd {
* 'cpi->tile_thr_data[t].td->mb.tmp_pred_bufs'.
*/
uint8_t *tmp_obmc_bufs[2];
+
+ /*!
+ * Temporary buffer used for upsampled prediction.
+ */
+ uint8_t *tmp_upsample_pred;
} MACROBLOCKD;
/*!\cond */
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 194eb396b5..6734f34e7c 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -957,6 +957,11 @@ typedef struct macroblock {
* prediction.
*/
uint8_t *tmp_pred_bufs[2];
+
+ /*!
+ * Buffer used for upsampled prediction.
+ */
+ uint8_t *upsample_pred;
/**@}*/
/*****************************************************************************
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 95baf9d907..bebb1f6718 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -879,6 +879,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;
const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
FeatureFlags *const features = &cm->features;
+ const int is_highbitdepth = seq_params->use_highbitdepth;
// in case of LAP, lag in frames is set according to number of lap buffers
// calculated at init time. This stores and restores LAP's lag in frames to
@@ -976,6 +977,14 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
}
}
+ if (x->upsample_pred == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->upsample_pred,
+ aom_memalign(16, (1 + is_highbitdepth) * ((MAX_SB_SIZE + 16) + 16) *
+ MAX_SB_SIZE * sizeof(*x->upsample_pred)));
+ x->e_mbd.tmp_upsample_pred = x->upsample_pred;
+ }
+
av1_reset_segment_features(cm);
av1_set_high_precision_mv(cpi, 1, 0);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 90f36b73a4..52bb9754c0 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1489,6 +1489,7 @@ typedef struct ThreadData {
CONV_BUF_TYPE *tmp_conv_dst;
uint64_t abs_sum_level;
uint8_t *tmp_pred_bufs[2];
+ uint8_t *upsample_pred;
uint8_t *wiener_tmp_pred_buf;
int intrabc_used;
int deltaq_used;
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 52b2a37606..33caa2ac9d 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -336,6 +336,7 @@ static inline void dealloc_compressor_data(AV1_COMP *cpi) {
aom_free(cpi->td.mb.palette_buffer);
release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
aom_free(cpi->td.mb.tmp_conv_dst);
+ aom_free(cpi->td.mb.upsample_pred);
for (int j = 0; j < 2; ++j) {
aom_free(cpi->td.mb.tmp_pred_bufs[j]);
}
@@ -478,6 +479,7 @@ static inline void free_thread_data(AV1_PRIMARY *ppi) {
aom_free(td->tctx);
aom_free(td->palette_buffer);
aom_free(td->tmp_conv_dst);
+ aom_free(td->upsample_pred);
release_compound_type_rd_buffers(&td->comp_rd_buffer);
for (int j = 0; j < 2; ++j) {
aom_free(td->tmp_pred_bufs[j]);
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index fad4f5119e..c10c662817 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -960,6 +960,7 @@ void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
assert(p_mt_info->workers != NULL);
assert(p_mt_info->tile_thr_data != NULL);
+ const int is_highbitdepth = ppi->seq_params.use_highbitdepth;
int num_workers = p_mt_info->num_workers;
int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC);
assert(num_enc_workers <= num_workers);
@@ -989,6 +990,11 @@ void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
"Failed to allocate PICK_MODE_CONTEXT");
}
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->upsample_pred,
+ aom_memalign(16, (1 + is_highbitdepth) * ((MAX_SB_SIZE + 16) + 16) *
+ MAX_SB_SIZE * sizeof(*td->upsample_pred)));
+
if (!is_first_pass && i < num_enc_workers) {
// Set up sms_tree.
if (av1_setup_sms_tree(ppi->cpi, td)) {
@@ -1630,6 +1636,7 @@ static inline void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ thread_data->td->mb.upsample_pred = thread_data->td->upsample_pred;
for (int j = 0; j < 2; ++j) {
thread_data->td->mb.tmp_pred_bufs[j] =
thread_data->td->tmp_pred_bufs[j];
@@ -1641,6 +1648,8 @@ static inline void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
thread_data->td->src_var_info_of_4x4_sub_blocks;
thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ thread_data->td->mb.e_mbd.tmp_upsample_pred =
+ thread_data->td->mb.upsample_pred;
for (int j = 0; j < 2; ++j) {
thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
thread_data->td->mb.tmp_pred_bufs[j];
@@ -2290,6 +2299,9 @@ static inline void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook,
}
thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ thread_data->td->mb.upsample_pred = thread_data->td->upsample_pred;
+ thread_data->td->mb.e_mbd.tmp_upsample_pred =
+ thread_data->td->mb.upsample_pred;
}
}
}
@@ -2466,6 +2478,9 @@ static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
"Error allocating temporal filter data");
}
+ thread_data->td->mb.upsample_pred = thread_data->td->upsample_pred;
+ thread_data->td->mb.e_mbd.tmp_upsample_pred =
+ thread_data->td->mb.upsample_pred;
}
}
}
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 06145ce1bc..b451a21ffe 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -2572,7 +2572,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
unsigned int besterr;
#if CONFIG_AV1_HIGHBITDEPTH
if (is_cur_buf_hbd(xd)) {
- DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+ uint16_t *pred16 = (uint16_t *)(xd->tmp_upsample_pred);
uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
if (second_pred != NULL) {
if (mask) {
@@ -2593,7 +2593,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
}
besterr = vfp->vf(pred8, w, src, src_stride, sse);
} else {
- DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ uint8_t *pred = xd->tmp_upsample_pred;
if (second_pred != NULL) {
if (mask) {
aom_comp_mask_upsampled_pred(
@@ -2614,7 +2614,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
besterr = vfp->vf(pred, w, src, src_stride, sse);
}
#else
- DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ uint8_t *pred = xd->tmp_upsample_pred;
if (second_pred != NULL) {
if (mask) {
aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
@@ -3682,21 +3682,23 @@ static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
const int mi_col = xd->mi_col;
unsigned int besterr;
- DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
#if CONFIG_AV1_HIGHBITDEPTH
if (is_cur_buf_hbd(xd)) {
- uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+ uint16_t *pred16 = (uint16_t *)(xd->tmp_upsample_pred);
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
subpel_search_type);
besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
} else {
+ uint8_t *pred = xd->tmp_upsample_pred;
aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
subpel_y_q3, ref, ref_stride, subpel_search_type);
besterr = vfp->ovf(pred, w, wsrc, mask, sse);
}
#else
+ uint8_t *pred = xd->tmp_upsample_pred;
aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
subpel_y_q3, ref, ref_stride, subpel_search_type);
diff --git a/av1/encoder/x86/reconinter_enc_sse2.c b/av1/encoder/x86/reconinter_enc_sse2.c
index a18e172b99..8ab566a719 100644
--- a/av1/encoder/x86/reconinter_enc_sse2.c
+++ b/av1/encoder/x86/reconinter_enc_sse2.c
@@ -126,8 +126,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
width, height);
} else {
- DECLARE_ALIGNED(16, uint8_t,
- temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ uint8_t *temp = comp_pred;
const int16_t *const kernel_x =
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
const int16_t *const kernel_y =
@@ -230,8 +229,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
kernel, 16, width, height, bd);
} else {
- DECLARE_ALIGNED(16, uint16_t,
- temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+ uint16_t *temp = CONVERT_TO_SHORTPTR(comp_pred8);
const int16_t *const kernel_x =
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
const int16_t *const kernel_y =
diff --git a/test/comp_mask_pred_test.cc b/test/comp_mask_pred_test.cc
index 7a23398d73..f57160ce0f 100644
--- a/test/comp_mask_pred_test.cc
+++ b/test/comp_mask_pred_test.cc
@@ -81,9 +81,11 @@ AV1CompMaskPredBase::~AV1CompMaskPredBase() = default;
void AV1CompMaskPredBase::SetUp() {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
av1_init_wedge_masks();
- comp_pred1_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+ comp_pred1_ =
+ (uint8_t *)aom_memalign(16, ((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE);
ASSERT_NE(comp_pred1_, nullptr);
- comp_pred2_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+ comp_pred2_ =
+ (uint8_t *)aom_memalign(16, ((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE);
ASSERT_NE(comp_pred2_, nullptr);
pred_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
ASSERT_NE(pred_, nullptr);
@@ -455,11 +457,11 @@ void AV1HighbdCompMaskPredTestBase::SetUp() {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
av1_init_wedge_masks();
- comp_pred1_ =
- (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_));
+ comp_pred1_ = (uint16_t *)aom_memalign(
+ 16, ((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE * sizeof(*comp_pred1_));
ASSERT_NE(comp_pred1_, nullptr);
- comp_pred2_ =
- (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_));
+ comp_pred2_ = (uint16_t *)aom_memalign(
+ 16, ((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE * sizeof(*comp_pred2_));
ASSERT_NE(comp_pred2_, nullptr);
pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_));
ASSERT_NE(pred_, nullptr);