Commit 76fe151b23 for aom

commit 76fe151b233dbc889dfd882a7d765de0d5dcc543
Author: Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
Date:   Tue Mar 24 17:57:34 2026 +0530

    Prune the evaluation of inter transform split

    In this CL, the top N estimated RD Costs of transform no-split are
    tracked for inter modes, and the transform split RD evaluation of an
    inter mode is skipped if the relevant no-split RD Cost is worse than
    the Nth best no-split RD Cost.

    The encoder performance results averaged over all resolutions are
    as below:

        Instruction Count                 BD-Rate Loss(%)
    cpu   Reduction(%)     avg.psnr   ovr.psnr   ssim     vmaf   vmaf_neg
     1       1.647          0.0253     0.0305   0.0261   0.0594   0.0524
     2       1.144          0.0121     0.0081  -0.0006   0.0096   0.0342

    The CL is bit-exact for speed>=3

    STATS_CHANGED

    Change-Id: I7f477093a84b27d19ac2d6be41e4aeeab26a8fb7

diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index f36f8cc6a6..bd352157a8 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -43,6 +43,9 @@ extern "C" {
 /*! Maximum value taken by transform type probabilities */
 #define MAX_TX_TYPE_PROB 1024

+/*! Maximum number of inter transform no-split RD Cost records. */
+#define TOP_INTER_TX_NO_SPLIT_COUNT 4
+
 //! Compute color sensitivity index for given plane
 #define COLOR_SENS_IDX(plane) ((plane) - 1)

@@ -1412,6 +1415,10 @@ typedef struct macroblock {
    */
   int palette_pixels;

+  /*! \brief Keep records of top no-split RD Costs of transform size search. */
+  int64_t top_inter_tx_no_split_rd[MAX_TX_BLOCKS_IN_MAX_SB]
+                                  [TOP_INTER_TX_NO_SPLIT_COUNT];
+
   /*!\brief Pointer to the structure which stores the statistics used by
    * sb-level multi-pass encoding.
    */
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index dd9d72df41..b331cf36c1 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -5925,6 +5925,18 @@ static inline void search_intra_modes_in_interframe(
   }
 }

+// Reset all tracked no-split RD Costs to INT64_MAX; does nothing at level 0.
+static inline void init_top_tx_no_split_rd_for_inter_modes(
+    MACROBLOCK *x, int prune_inter_tx_split_rd_eval_lvl) {
+  if (!prune_inter_tx_split_rd_eval_lvl) return;
+
+  for (int i = 0; i < MAX_TX_BLOCKS_IN_MAX_SB; i++) {
+    for (int j = 0; j < TOP_INTER_TX_NO_SPLIT_COUNT; j++) {
+      x->top_inter_tx_no_split_rd[i][j] = INT64_MAX;
+    }
+  }
+}
+
 #if !CONFIG_REALTIME_ONLY
 // Prepare inter_cost and intra_cost from TPL stats, which are used as ML
 // features in intra mode pruning.
@@ -6096,6 +6108,10 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
     INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
     INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
   };
+
+  init_top_tx_no_split_rd_for_inter_modes(
+      x, sf->tx_sf.prune_inter_tx_split_rd_eval_lvl);
+
   HandleInterModeArgs args = { { NULL },
                                { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
                                { NULL },
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index b7bbb36065..ebc452cd84 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1190,6 +1190,7 @@ static void set_good_speed_features_framesize_independent(
     sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
     sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
     sf->tx_sf.tx_type_search.skip_tx_search = 1;
+    sf->tx_sf.prune_inter_tx_split_rd_eval_lvl = 1;

     sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3;
     sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
@@ -1256,6 +1257,8 @@ static void set_good_speed_features_framesize_independent(
     // TODO(any): Re-evaluate this feature set to 1 in speed 2.
     sf->tpl_sf.allow_compound_pred = 0;
     sf->tpl_sf.prune_ref_frames_in_tpl = 1;
+
+    sf->tx_sf.prune_inter_tx_split_rd_eval_lvl = 2;
   }

   if (speed >= 3) {
@@ -2441,6 +2444,7 @@ static inline void init_tx_sf(TX_SPEED_FEATURES *tx_sf) {
   tx_sf->prune_tx_size_level = 0;
   tx_sf->prune_intra_tx_depths_using_nn = false;
   tx_sf->use_rd_based_breakout_for_intra_tx_search = false;
+  tx_sf->prune_inter_tx_split_rd_eval_lvl = 0;
 }

 static inline void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf,
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 8e89031b5f..c3c3a18ad8 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1464,6 +1464,12 @@ typedef struct TX_SPEED_FEATURES {
   // for speed 3, 4, 5, 6, 7 and 8 on a typical image dataset with coding
   // performance change less than 0.004%.
   bool use_rd_based_breakout_for_intra_tx_search;
+
+  // Prune RD evaluation of transform split using RD Costs of transform no-split
+  // of inter modes that are evaluated so far.
+  // Values are 0 (not used), 1 - 2 with progressively increasing
+  // aggressiveness, i.e., decreasing number of top candidates.
+  int prune_inter_tx_split_rd_eval_lvl;
 } TX_SPEED_FEATURES;

 typedef struct RD_CALC_SPEED_FEATURES {
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 1829a25723..2058147fed 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -79,6 +79,10 @@ static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4,  8,  16, 32, 32, 6,  6,
                                                      12, 12, 23, 23, 32, 32, 8,
                                                      8,  16, 16, 23, 23 };

+// Look-up table for the number of top no-split RD Costs to consider, indexed
+// by (prune_inter_tx_split_rd_eval_lvl - 1).
+static const int num_inter_tx_no_split_cand[2] = { 4, 3 };
+
 static inline uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
@@ -300,6 +304,60 @@ static inline void save_mb_rd_info(int n4, uint32_t hash,
   mb_rd_info->rd_stats = *rd_stats;
 }

+// Record tmp_rd in the sorted top no-split RD Cost list kept for blk_idx.
+static inline void push_inter_block_tx_no_split_rd(
+    MACROBLOCK *x, const MB_MODE_INFO *mbmi, int64_t tmp_rd, int blk_idx,
+    int prune_inter_tx_split_rd_eval_lvl) {
+  assert(blk_idx < MAX_TX_BLOCKS_IN_MAX_SB);
+  if (!prune_inter_tx_split_rd_eval_lvl) return;
+  // blk_idx is -1 when called for sub-blocks from try_tx_block_split().
+  if (blk_idx == -1 || tmp_rd == INT64_MAX) return;
+
+  // Skip-mode and intraBC blocks are not recorded
+  if (mbmi->skip_mode != 0 || is_intrabc_block(mbmi)) return;
+
+  int num_top_cand =
+      num_inter_tx_no_split_cand[prune_inter_tx_split_rd_eval_lvl - 1];
+  assert(num_top_cand <= TOP_INTER_TX_NO_SPLIT_COUNT);
+
+  // Sorted insert: shift larger entries down one slot, dropping the last one
+  for (int i = 0; i < num_top_cand; i++) {
+    if (tmp_rd < x->top_inter_tx_no_split_rd[blk_idx][i]) {
+      for (int j = num_top_cand - 1; j > i; j--) {
+        x->top_inter_tx_no_split_rd[blk_idx][j] =
+            x->top_inter_tx_no_split_rd[blk_idx][j - 1];
+      }
+      x->top_inter_tx_no_split_rd[blk_idx][i] = tmp_rd;
+      break;
+    }
+  }
+}
+
+// Return true if tmp_rd exceeds the last tracked top no-split RD Cost.
+static inline bool prune_tx_split_eval_using_no_split_rd(
+    const MACROBLOCK *x, const MB_MODE_INFO *mbmi, int64_t tmp_rd, int blk_idx,
+    int prune_inter_tx_split_rd_eval_lvl) {
+  if (!prune_inter_tx_split_rd_eval_lvl) return false;
+  // Never prune when blk_idx is -1 or tmp_rd is INT64_MAX.
+  if (blk_idx == -1 || tmp_rd == INT64_MAX) return false;
+
+  // Do not prune for skip and intraBC modes
+  if (mbmi->skip_mode != 0 || is_intrabc_block(mbmi)) return false;
+
+  int num_top_cand =
+      num_inter_tx_no_split_cand[prune_inter_tx_split_rd_eval_lvl - 1];
+  assert(num_top_cand <= TOP_INTER_TX_NO_SPLIT_COUNT);
+
+  // Do not prune while the last considered slot still holds INT64_MAX
+  if (x->top_inter_tx_no_split_rd[blk_idx][num_top_cand - 1] == INT64_MAX)
+    return false;
+
+  if (tmp_rd > x->top_inter_tx_no_split_rd[blk_idx][num_top_cand - 1])
+    return true;
+
+  return false;
+}
+
 static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
                                  const SPEED_FEATURES *sf,
                                  int tx_size_search_method) {
@@ -327,7 +385,7 @@ static inline void select_tx_block(
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
-    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode);
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode, int blk_idx);

 // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
 // 0: Do not collect any RD stats
@@ -2427,7 +2485,7 @@ static inline void try_tx_block_split(
       select_tx_block(cpi, x, offsetr, offsetc, block, sub_txs, depth + 1,
                       plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats,
                       no_split_rd / nblks, ref_best_rd - split_rd_stats->rdcost,
-                      &this_cost_valid, ftxs_mode);
+                      &this_cost_valid, ftxs_mode, -1);
       if (!this_cost_valid) {
         split_rd_stats->rdcost = INT64_MAX;
         return;
@@ -2542,7 +2600,7 @@ static inline void select_tx_block(
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
-    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) {
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode, int blk_idx) {
   assert(tx_size < TX_SIZES_ALL);
   av1_init_rd_stats(rd_stats);
   if (ref_best_rd < 0) {
@@ -2583,6 +2641,10 @@ static inline void select_tx_block(
                           plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
                           ftxs_mode, &no_split);

+    push_inter_block_tx_no_split_rd(
+        x, mbmi, no_split.rd, blk_idx,
+        cpi->sf.tx_sf.prune_inter_tx_split_rd_eval_lvl);
+
     // Speed features for early termination.
     const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
     if (search_level) {
@@ -2597,6 +2659,11 @@ static inline void select_tx_block(
     if (cpi->sf.tx_sf.txb_split_cap) {
       if (p->eobs[block] == 0) try_split = 0;
     }
+    if (prune_tx_split_eval_using_no_split_rd(
+            x, mbmi, no_split.rd, blk_idx,
+            cpi->sf.tx_sf.prune_inter_tx_split_rd_eval_lvl)) {
+      try_split = 0;
+    }
   }

   // ML based speed feature to skip searching for split transform blocks.
@@ -3402,6 +3469,7 @@ static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
   int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, 0);
   int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_cost, 0);
   int block = 0;
+  int blk_idx = 0;

   av1_init_rd_stats(rd_stats);
   for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) {
@@ -3415,7 +3483,8 @@ static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
       // Search for the best transform block size and type for the sub-block.
       select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize,
                       ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX,
-                      best_rd_sofar, &is_cost_valid, ftxs_mode);
+                      best_rd_sofar, &is_cost_valid, ftxs_mode, blk_idx);
+      blk_idx++;
       if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
         av1_invalid_rd_stats(rd_stats);
         return INT64_MAX;