Commit 76fe151b23 for aom
commit 76fe151b233dbc889dfd882a7d765de0d5dcc543
Author: Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
Date: Tue Mar 24 17:57:34 2026 +0530
Prune the evaluation of inter transform split
In the CL, the top N estimated RD Costs of transform no-split are tracked
for inter modes, and transform split RD evaluation of an inter mode is
skipped if the relevant no-split RD Cost is worse than the top Nth
no-split RD Cost.
The encoder performance results averaged over all resolutions are
as below:
Instruction Count BD-Rate Loss(%)
cpu Reduction(%) avg.psnr ovr.psnr ssim vmaf vmaf_neg
1 1.647 0.0253 0.0305 0.0261 0.0594 0.0524
2 1.144 0.0121 0.0081 -0.0006 0.0096 0.0342
The CL is bit-exact for speed>=3
STATS_CHANGED
Change-Id: I7f477093a84b27d19ac2d6be41e4aeeab26a8fb7
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index f36f8cc6a6..bd352157a8 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -43,6 +43,9 @@ extern "C" {
/*! Maximum value taken by transform type probabilities */
#define MAX_TX_TYPE_PROB 1024
+/*! Maximum number of top no-split RD costs tracked per transform block. */
+#define TOP_INTER_TX_NO_SPLIT_COUNT 4
+
//! Compute color sensitivity index for given plane
#define COLOR_SENS_IDX(plane) ((plane) - 1)
@@ -1412,6 +1415,10 @@ typedef struct macroblock {
*/
int palette_pixels;
+ /*! \brief Keep records of top no-split RD Costs of transform size search. */
+ int64_t top_inter_tx_no_split_rd[MAX_TX_BLOCKS_IN_MAX_SB]
+ [TOP_INTER_TX_NO_SPLIT_COUNT];
+
/*!\brief Pointer to the structure which stores the statistics used by
* sb-level multi-pass encoding.
*/
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index dd9d72df41..b331cf36c1 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -5925,6 +5925,18 @@ static inline void search_intra_modes_in_interframe(
}
}
+// Reset every top no-split RD Cost record to INT64_MAX (no record stored yet).
+static inline void init_top_tx_no_split_rd_for_inter_modes(
+ MACROBLOCK *x, int prune_inter_tx_split_rd_eval_lvl) {
+ if (!prune_inter_tx_split_rd_eval_lvl) return;  // Level 0: nothing to reset.
+
+ for (int i = 0; i < MAX_TX_BLOCKS_IN_MAX_SB; i++) {
+ for (int j = 0; j < TOP_INTER_TX_NO_SPLIT_COUNT; j++) {
+ x->top_inter_tx_no_split_rd[i][j] = INT64_MAX;  // Marks an empty slot.
+ }
+ }
+}
+
#if !CONFIG_REALTIME_ONLY
// Prepare inter_cost and intra_cost from TPL stats, which are used as ML
// features in intra mode pruning.
@@ -6096,6 +6108,10 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
};
+
+ init_top_tx_no_split_rd_for_inter_modes(
+ x, sf->tx_sf.prune_inter_tx_split_rd_eval_lvl);
+
HandleInterModeArgs args = { { NULL },
{ MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
{ NULL },
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index b7bbb36065..ebc452cd84 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1190,6 +1190,7 @@ static void set_good_speed_features_framesize_independent(
sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
sf->tx_sf.tx_type_search.skip_tx_search = 1;
+ sf->tx_sf.prune_inter_tx_split_rd_eval_lvl = 1;
sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3;
sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
@@ -1256,6 +1257,8 @@ static void set_good_speed_features_framesize_independent(
// TODO(any): Re-evaluate this feature set to 1 in speed 2.
sf->tpl_sf.allow_compound_pred = 0;
sf->tpl_sf.prune_ref_frames_in_tpl = 1;
+
+ sf->tx_sf.prune_inter_tx_split_rd_eval_lvl = 2;
}
if (speed >= 3) {
@@ -2441,6 +2444,7 @@ static inline void init_tx_sf(TX_SPEED_FEATURES *tx_sf) {
tx_sf->prune_tx_size_level = 0;
tx_sf->prune_intra_tx_depths_using_nn = false;
tx_sf->use_rd_based_breakout_for_intra_tx_search = false;
+ tx_sf->prune_inter_tx_split_rd_eval_lvl = 0;
}
static inline void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf,
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 8e89031b5f..c3c3a18ad8 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1464,6 +1464,12 @@ typedef struct TX_SPEED_FEATURES {
// for speed 3, 4, 5, 6, 7 and 8 on a typical image dataset with coding
// performance change less than 0.004%.
bool use_rd_based_breakout_for_intra_tx_search;
+
+ // Prune RD evaluation of transform split using RD Costs of transform no-split
+ // of inter modes that have been evaluated so far.
+ // Values are 0 (not used), 1 - 2 with progressively increasing
+ // aggressiveness, i.e., decreasing number of top candidates.
+ int prune_inter_tx_split_rd_eval_lvl;
} TX_SPEED_FEATURES;
typedef struct RD_CALC_SPEED_FEATURES {
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 1829a25723..2058147fed 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -79,6 +79,10 @@ static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4, 8, 16, 32, 32, 6, 6,
12, 12, 23, 23, 32, 32, 8,
8, 16, 16, 23, 23 };
+// Look-up table for the number of top no-split RD Costs that are considered,
+// indexed by (prune_inter_tx_split_rd_eval_lvl - 1) for the non-zero levels.
+static const int num_inter_tx_no_split_cand[2] = { 4, 3 };
+
static inline uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
const int rows = block_size_high[bsize];
const int cols = block_size_wide[bsize];
@@ -300,6 +304,60 @@ static inline void save_mb_rd_info(int n4, uint32_t hash,
mb_rd_info->rd_stats = *rd_stats;
}
+// Record a transform no-split RD Cost in the ascending top list of blk_idx.
+static inline void push_inter_block_tx_no_split_rd(
+ MACROBLOCK *x, const MB_MODE_INFO *mbmi, int64_t tmp_rd, int blk_idx,
+ int prune_inter_tx_split_rd_eval_lvl) {
+ assert(blk_idx < MAX_TX_BLOCKS_IN_MAX_SB);
+ if (!prune_inter_tx_split_rd_eval_lvl) return;  // Level 0: feature is off.
+
+ if (blk_idx == -1 || tmp_rd == INT64_MAX) return;  // No slot or no cost.
+
+ // Do not store for skip and intraBC modes
+ if (mbmi->skip_mode != 0 || is_intrabc_block(mbmi)) return;
+
+ int num_top_cand =
+ num_inter_tx_no_split_cand[prune_inter_tx_split_rd_eval_lvl - 1];
+ assert(num_top_cand <= TOP_INTER_TX_NO_SPLIT_COUNT);
+
+ // Insert the RD Cost in ascending order among the first num_top_cand entries
+ for (int i = 0; i < num_top_cand; i++) {
+ if (tmp_rd < x->top_inter_tx_no_split_rd[blk_idx][i]) {
+ for (int j = num_top_cand - 1; j > i; j--) {  // Shift; last entry drops.
+ x->top_inter_tx_no_split_rd[blk_idx][j] =
+ x->top_inter_tx_no_split_rd[blk_idx][j - 1];
+ }
+ x->top_inter_tx_no_split_rd[blk_idx][i] = tmp_rd;
+ break;
+ }
+ }
+}
+
+// Return true if the transform split RD evaluation should be skipped.
+static inline bool prune_tx_split_eval_using_no_split_rd(
+ const MACROBLOCK *x, const MB_MODE_INFO *mbmi, int64_t tmp_rd, int blk_idx,
+ int prune_inter_tx_split_rd_eval_lvl) {
+ if (!prune_inter_tx_split_rd_eval_lvl) return false;  // Level 0: never prune.
+
+ if (blk_idx == -1 || tmp_rd == INT64_MAX) return false;  // Nothing to compare.
+
+ // Do not prune for skip and intraBC modes
+ if (mbmi->skip_mode != 0 || is_intrabc_block(mbmi)) return false;
+
+ int num_top_cand =
+ num_inter_tx_no_split_cand[prune_inter_tx_split_rd_eval_lvl - 1];
+ assert(num_top_cand <= TOP_INTER_TX_NO_SPLIT_COUNT);
+
+ // Do not prune if the Nth slot is still empty (fewer than N costs recorded)
+ if (x->top_inter_tx_no_split_rd[blk_idx][num_top_cand - 1] == INT64_MAX)
+ return false;
+
+ if (tmp_rd > x->top_inter_tx_no_split_rd[blk_idx][num_top_cand - 1])
+ return true;  // Strictly worse than the Nth best recorded cost.
+
+ return false;
+}
+
static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
const SPEED_FEATURES *sf,
int tx_size_search_method) {
@@ -327,7 +385,7 @@ static inline void select_tx_block(
TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
- int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode);
+ int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode, int blk_idx);
// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
// 0: Do not collect any RD stats
@@ -2427,7 +2485,7 @@ static inline void try_tx_block_split(
select_tx_block(cpi, x, offsetr, offsetc, block, sub_txs, depth + 1,
plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats,
no_split_rd / nblks, ref_best_rd - split_rd_stats->rdcost,
- &this_cost_valid, ftxs_mode);
+ &this_cost_valid, ftxs_mode, -1);
if (!this_cost_valid) {
split_rd_stats->rdcost = INT64_MAX;
return;
@@ -2542,7 +2600,7 @@ static inline void select_tx_block(
TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
- int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) {
+ int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode, int blk_idx) {
assert(tx_size < TX_SIZES_ALL);
av1_init_rd_stats(rd_stats);
if (ref_best_rd < 0) {
@@ -2583,6 +2641,10 @@ static inline void select_tx_block(
plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
ftxs_mode, &no_split);
+ push_inter_block_tx_no_split_rd(
+ x, mbmi, no_split.rd, blk_idx,
+ cpi->sf.tx_sf.prune_inter_tx_split_rd_eval_lvl);
+
// Speed features for early termination.
const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
if (search_level) {
@@ -2597,6 +2659,11 @@ static inline void select_tx_block(
if (cpi->sf.tx_sf.txb_split_cap) {
if (p->eobs[block] == 0) try_split = 0;
}
+ if (prune_tx_split_eval_using_no_split_rd(
+ x, mbmi, no_split.rd, blk_idx,
+ cpi->sf.tx_sf.prune_inter_tx_split_rd_eval_lvl)) {
+ try_split = 0;
+ }
}
// ML based speed feature to skip searching for split transform blocks.
@@ -3402,6 +3469,7 @@ static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, 0);
int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_cost, 0);
int block = 0;
+ int blk_idx = 0;
av1_init_rd_stats(rd_stats);
for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) {
@@ -3415,7 +3483,8 @@ static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
// Search for the best transform block size and type for the sub-block.
select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize,
ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX,
- best_rd_sofar, &is_cost_valid, ftxs_mode);
+ best_rd_sofar, &is_cost_valid, ftxs_mode, blk_idx);
+ blk_idx++;
if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
av1_invalid_rd_stats(rd_stats);
return INT64_MAX;