Commit c47960f881 for aom

commit c47960f88149f23f0a07c24cee64ca7c3cf3ab04
Author: Yunqing Wang <yunqingwang@google.com>
Date:   Wed Mar 4 15:34:33 2026 -0800

    Optimization in apply_temporal_filter function

    Added the optimization and got back the majority of the encoder speed loss
    due to the introduction of the 64x64 TF block. This got back ~1.7% of
    encoder time.

    Change-Id: I8b0499b52753f4c31cb44060971e3b78ed925dbb

diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 58dbf70090..3840d26de7 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -46,6 +46,9 @@

 // NOTE: All `tf` in this file means `temporal filtering`.

+// Number of 16x16 blocks within one 64x64 TF block.
+#define NUM_16X16 16
+
 // Forward Declaration.
 static void tf_determine_block_partition(const MV block_mv, const int block_mse,
                                          const MV *midblock_mvs,
@@ -433,7 +436,7 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
                                  midblock_mses, subblock_mvs, subblock_mses);
   } else {
     // Copy 64X64 block mv and mse values to sub blocks
-    for (int i = 0; i < 16; ++i) {
+    for (int i = 0; i < NUM_16X16; ++i) {
       subblock_mvs[i] = block_mv;
       subblock_mses[i] = block_mse;
     }
@@ -498,7 +501,7 @@ static void tf_determine_block_partition(const MV block_mv, const int block_mse,
   min_subblock_mse = INT_MAX;
   max_subblock_mse = INT_MIN;
   sum_subblock_mse = 0;
-  for (i = 0; i < 16; ++i) {
+  for (i = 0; i < NUM_16X16; ++i) {
     sum_subblock_mse += subblock_mses[i];
     min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
     max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
@@ -509,7 +512,7 @@ static void tf_determine_block_partition(const MV block_mv, const int block_mse,
       ((block_mse * 14 <= sum_subblock_mse) &&
        (max_subblock_mse - min_subblock_mse) * 8 <
            sum_subblock_mse)) {  // No split.
-    for (i = 0; i < 16; ++i) {
+    for (i = 0; i < NUM_16X16; ++i) {
       subblock_mvs[i] = block_mv;
       subblock_mses[i] = block_mse;
     }
@@ -835,6 +838,18 @@ void av1_apply_temporal_filter_c(
     decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
   }

+  // Figure out each 16x16 block's d_factor beforehand.
+  double d_factor[NUM_16X16] = { 0 };
+  for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
+    // Larger motion vector -> smaller filtering weight.
+    const MV mv = subblock_mvs[subblock_idx];
+    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+    distance_threshold = AOMMAX(distance_threshold, 1);
+    d_factor[subblock_idx] = distance / distance_threshold;
+    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+  }
+
   // Allocate memory for pixel-wise squared differences. They,
   // regardless of the subsampling, are assigned with memory of size `mb_pels`.
   uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t));
@@ -920,15 +935,9 @@ void av1_apply_temporal_filter_c(
         const double combined_error =
             weight_factor * window_error + block_error * inv_factor;

-        // Larger motion vector -> smaller filtering weight.
-        const MV mv = subblock_mvs[subblock_idx];
-        const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
-        const double distance_threshold =
-            (double)AOMMAX(min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD, 1);
-        const double d_factor = AOMMAX(distance / distance_threshold, 1);
-
         // Compute filter weight.
-        double scaled_error = combined_error * d_factor * decay_factor[plane];
+        double scaled_error =
+            combined_error * d_factor[subblock_idx] * decay_factor[plane];
         scaled_error = AOMMIN(scaled_error, 7);
         int weight;
         if (tf_wgt_calc_lvl == 0) {
@@ -1105,14 +1114,14 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
       // Motion search.
       // block size is 64x64. 16 16x16 in 1 64x64.
       // Store motion search results in 16x16 units.
-      MV subblock_mvs[16] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv,
-                              kZeroMv, kZeroMv, kZeroMv, kZeroMv,
-                              kZeroMv, kZeroMv, kZeroMv, kZeroMv,
-                              kZeroMv, kZeroMv, kZeroMv, kZeroMv };
-      int subblock_mses[16] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX,
-                                INT_MAX, INT_MAX, INT_MAX, INT_MAX,
-                                INT_MAX, INT_MAX, INT_MAX, INT_MAX,
-                                INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+      MV subblock_mvs[NUM_16X16] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv,
+                                     kZeroMv, kZeroMv, kZeroMv, kZeroMv,
+                                     kZeroMv, kZeroMv, kZeroMv, kZeroMv,
+                                     kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+      int subblock_mses[NUM_16X16] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX,
+                                       INT_MAX, INT_MAX, INT_MAX, INT_MAX,
+                                       INT_MAX, INT_MAX, INT_MAX, INT_MAX,
+                                       INT_MAX, INT_MAX, INT_MAX, INT_MAX };
       int is_dc_diff_large = 0;
       int is_low_cntras = 0;