Commit c9b95efd26 for aom

commit c9b95efd26ee2b9937c9d8569dd0e6dcd8b4ae66
Author: Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
Date:   Thu Apr 23 13:42:02 2026 +0530

    Prune either h or v 1:4 partitions

    In this CL, simple motion search is used to compute the SSE of
    each of the 4 sub-blocks of PARTITION_HORZ_4 and
    PARTITION_VERT_4. A skip RD cost is then derived for each
    partition type from its summed SSE and its partition cost, and
    the type with the higher skip RD cost is pruned.

         Instruction Count                   BD-Rate Loss(%)
    cpu    Reduction(%)    avg.psnr  ovr.psnr   ssim     vmaf    vmaf_neg
     3       1.684          0.0581    0.0611   0.0549   0.0890    0.0525
     4       1.073          0.0438    0.0513   0.0519   0.0633    0.0598
     5       0.464          0.0232    0.0242   0.0285   0.0463    0.0434

    This CL is bit-exact for speed=6.

    STATS_CHANGED

    Change-Id: Ib03a7bf9dc5ffe274836683b657734731494b942

diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 19d42b9e1c..9455b4bc51 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -4048,12 +4048,80 @@ static void prune_4_partition_using_split_info(
   }
 }

+// Prunes one of PARTITION_HORZ_4 / PARTITION_VERT_4 by comparing skip RD costs
+// built from simple motion search SSE.
+//
+// For each 4-way type, av1_simple_motion_search_sse_var() is run on each of
+// the SUB_PARTITIONS_PART4 sub-blocks and the returned SSE values are summed.
+// The sum is used as the distortion term and the partition cost of that type
+// as the rate term of RDCOST(); the type with the strictly larger result is
+// disabled. If the two results are equal, both types stay enabled.
+//
+// cpi, x:               encoder and macroblock contexts, forwarded to the
+//                       motion search; x->rdmult is used in RDCOST().
+// part_search_state:    supplies partition_cost[] for both 4-way types.
+// sms_tree:             supplies the start MV (start_mvs[ref]) for the search.
+// mi_row, mi_col:       position of the block's first sub-block, in mi units.
+// bsize:                size of the block being partitioned.
+// part4_search_allowed: in/out flags indexed by HORZ4 / VERT4; at most one
+//                       entry is cleared to 0.
+static void prune_part4_using_sms(AV1_COMP *const cpi, MACROBLOCK *x,
+                                  const PartitionSearchState *part_search_state,
+                                  SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row,
+                                  int mi_col, BLOCK_SIZE bsize,
+                                  int *part4_search_allowed) {
+  // Choosing between the two types only makes sense while both are still
+  // candidates; if either is already disallowed there is nothing to do.
+  if (!part4_search_allowed[HORZ4] || !part4_search_allowed[VERT4]) return;

+  // Per-sub-block SSE; only the sums below are consumed afterwards.
+  unsigned int sms_h_part4_sse[4];
+  unsigned int sms_v_part4_sse[4];

+  const BLOCK_SIZE subsize_h4 = get_partition_subsize(bsize, PARTITION_HORZ_4);
+  const BLOCK_SIZE subsize_v4 = get_partition_subsize(bsize, PARTITION_VERT_4);

+  // Block height and width in mi units, used to step between sub-blocks.
+  const int h_mi = mi_size_high[bsize];
+  const int w_mi = mi_size_wide[bsize];

+  // Search against ALTREF_FRAME when the source frame is an alt-ref, otherwise
+  // LAST_FRAME. Bail out without pruning if that reference is not enabled.
+  const int ref = cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+  if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref])) return;

+  int64_t part4_h_sse_sum = 0;
+  int64_t part4_v_sse_sum = 0;

+  // HORZ_4: sub-blocks share mi_col and advance in mi_row by a quarter of the
+  // block height per index.
+  for (int r_idx = 0; r_idx < SUB_PARTITIONS_PART4; r_idx++) {
+    unsigned int part_var;
+    const int sub_mi_row = mi_row + r_idx * h_mi / 4;
+    const int sub_mi_col = mi_col;

+    av1_simple_motion_search_sse_var(cpi, x, sub_mi_row, sub_mi_col, subsize_h4,
+                                     ref, sms_tree->start_mvs[ref],
+                                     /*num_planes=*/1, /*use_subpixel=*/1,
+                                     &sms_h_part4_sse[r_idx], &part_var);

+    // The variance output is required by the call but not used here.
+    (void)part_var;

+    part4_h_sse_sum += sms_h_part4_sse[r_idx];
+  }

+  // VERT_4: sub-blocks share mi_row and advance in mi_col by a quarter of the
+  // block width per index.
+  for (int r_idx = 0; r_idx < SUB_PARTITIONS_PART4; r_idx++) {
+    unsigned int part_var;
+    const int sub_mi_row = mi_row;
+    const int sub_mi_col = mi_col + r_idx * w_mi / 4;

+    av1_simple_motion_search_sse_var(cpi, x, sub_mi_row, sub_mi_col, subsize_v4,
+                                     ref, sms_tree->start_mvs[ref],
+                                     /*num_planes=*/1, /*use_subpixel=*/1,
+                                     &sms_v_part4_sse[r_idx], &part_var);
+    // The variance output is required by the call but not used here.
+    (void)part_var;

+    part4_v_sse_sum += sms_v_part4_sse[r_idx];
+  }

+  // Skip RD cost per type: rate is the partition cost, distortion is the
+  // summed SSE.
+  // NOTE(review): the SSE sum is passed to RDCOST() with no extra scaling
+  // applied here -- confirm this matches the distortion scale RDCOST expects.
+  const int64_t part4_h_rd =
+      RDCOST(x->rdmult, part_search_state->partition_cost[PARTITION_HORZ_4],
+             part4_h_sse_sum);

+  const int64_t part4_v_rd =
+      RDCOST(x->rdmult, part_search_state->partition_cost[PARTITION_VERT_4],
+             part4_v_sse_sum);

+  // Disable the type with the strictly higher cost; on a tie neither
+  // condition holds and both types remain allowed.
+  if (part4_h_rd > part4_v_rd) part4_search_allowed[HORZ4] = 0;

+  if (part4_v_rd > part4_h_rd) part4_search_allowed[VERT4] = 0;
+}
+
 // Prune 4-way partition search.
 static void prune_4_way_partition_search(
     AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree,
-    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
-    int pb_source_variance, int prune_ext_part_state,
-    int part4_search_allowed[NUM_PART4_TYPES]) {
+    PartitionSearchState *part_search_state, SIMPLE_MOTION_DATA_TREE *sms_tree,
+    RD_STATS *best_rdc, int pb_source_variance, int prune_ext_part_state,
+    int mi_row, int mi_col, int part4_search_allowed[NUM_PART4_TYPES]) {
   const PartitionBlkParams blk_params = part_search_state->part_blk_params;
   const BLOCK_SIZE bsize = blk_params.bsize;

@@ -4132,6 +4200,15 @@ static void prune_4_way_partition_search(
   // in the current block and sub-blocks in PARTITION_SPLIT.
   prune_4_partition_using_split_info(cpi, x, part_search_state,
                                      part4_search_allowed);
+
+  if (cpi->sf.part_sf.prune_h_or_v_4part_using_sms_info && partition4_allowed &&
+      best_rdc->rdcost != INT64_MAX &&
+      av1_is_whole_blk_in_frame(&part_search_state->part_blk_params,
+                                &cpi->common.mi_params) &&
+      !frame_is_intra_only(&cpi->common)) {
+    prune_part4_using_sms(cpi, x, part_search_state, sms_tree, mi_row, mi_col,
+                          bsize, part4_search_allowed);
+  }
 }

 // Set params needed for PARTITION_NONE search.
@@ -5834,8 +5911,9 @@ BEGIN_PARTITION_SEARCH:
   // 4-way partitions search stage.
   int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 };
   // Prune 4-way partition search.
-  prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc,
-                               pb_source_variance, prune_ext_part_state,
+  prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, sms_tree,
+                               &best_rdc, pb_source_variance,
+                               prune_ext_part_state, mi_row, mi_col,
                                part4_search_allowed);

 #if CONFIG_COLLECT_COMPONENT_TIMING
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index abfa1a53f3..8888da6571 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1280,6 +1280,7 @@ static void set_good_speed_features_framesize_independent(
             : (boosted ? SIMPLE_AGG_LVL3 : QIDX_BASED_AGG_LVL1);
     sf->part_sf.prune_ext_part_using_split_info = 1;
     sf->part_sf.simple_motion_search_rect_split = 1;
+    sf->part_sf.prune_h_or_v_4part_using_sms_info = true;

     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
     sf->mv_sf.search_method = DIAMOND;
@@ -2311,6 +2312,7 @@ static inline void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
   part_sf->skip_non_sq_part_based_on_none = 0;
   part_sf->disable_8x8_part_based_on_qidx = 0;
   part_sf->split_partition_penalty_level = 0;
+  part_sf->prune_h_or_v_4part_using_sms_info = false;
 }

 static inline void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 5b1a2e8e1e..7c756295a1 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -855,6 +855,10 @@ typedef struct PARTITION_SPEED_FEATURES {
   // Disables 8x8 and below partitions for low quantizers.
   int disable_8x8_part_based_on_qidx;

+  // Prunes whichever of PARTITION_HORZ_4 and PARTITION_VERT_4 has the higher
+  // skip RD cost, estimated from the SSE returned by simple motion search.
+  bool prune_h_or_v_4part_using_sms_info;
+
   // Decoder side speed feature to add penalty for use of smaller partitions.
   // Takes values 0 - 2, 0 indicating no penalty and higher level indicating
   // increased penalty.