Commit bd15da94b0 for aom

commit bd15da94b034046a76a6ce9dbcb5137210244bb0
Author: Li Zhang <li.zhang2@arm.com>
Date:   Mon Mar 30 11:13:57 2026 +0200

    Arm: Enable Neon and Neon Dotprod for av1_apply_temporal_filter

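    Update the Neon and Neon Dotprod implementations of
    av1_apply_temporal_filter to work with a TF_BLOCK_SIZE of 64x64, remove
    the fallback to the C implementation for 64x64 blocks, and re-enable the
    corresponding unit tests.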

    Bug: aomedia:493082083
    Change-Id: I5f32ec403b5370fe94ddfaef6f4b121fc6cd20c5

diff --git a/av1/encoder/arm/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c
index 3f62af6659..ba158d9550 100644
--- a/av1/encoder/arm/temporal_filter_neon.c
+++ b/av1/encoder/arm/temporal_filter_neon.c
@@ -83,8 +83,8 @@ static void apply_temporal_filter(
     const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
     const double decay_factor, const double inv_factor,
     const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
-  assert(((block_width == 16) || (block_width == 32)) &&
-         ((block_height == 16) || (block_height == 32)));
+  assert(((block_width == 64) || (block_width == 32)) &&
+         ((block_height == 64) || (block_height == 32)));

   uint32_t diff_sse[BH][BW];
   const uint16x8x4_t vmask = vld1q_u16_x4(kSlidingWindowMask);
@@ -149,12 +149,16 @@ static void apply_temporal_filter(
   // Perform filtering.
   if (tf_wgt_calc_lvl == 0) {
     for (unsigned int i = 0, k = 0; i < block_height; i++) {
+      const int y32 = i / (block_height / 2);
+      const int y16 = (i % (block_height / 2)) / (block_height / 4);
+
       for (unsigned int j = 0; j < block_width; j++, k++) {
         const int pixel_value = frame[i * stride + j];

         const double window_error = diff_sse[i][j] * inv_num_ref_pixels;
-        const int subblock_idx =
-            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const int x32 = j / (block_width / 2);
+        const int x16 = (j % (block_width / 2)) / (block_width / 4);
+        const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
         const double block_error = (double)subblock_mses[subblock_idx];
         const double combined_error =
             weight_factor * window_error + block_error * inv_factor;
@@ -169,12 +173,16 @@ static void apply_temporal_filter(
     }
   } else {
     for (unsigned int i = 0, k = 0; i < block_height; i++) {
+      const int y32 = i / (block_height / 2);
+      const int y16 = (i % (block_height / 2)) / (block_height / 4);
+
       for (unsigned int j = 0; j < block_width; j++, k++) {
         const int pixel_value = frame[i * stride + j];

         const double window_error = diff_sse[i][j] * inv_num_ref_pixels;
-        const int subblock_idx =
-            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const int x32 = j / (block_width / 2);
+        const int x16 = (j % (block_width / 2)) / (block_width / 4);
+        const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
         const double block_error = (double)subblock_mses[subblock_idx];
         const double combined_error =
             weight_factor * window_error + block_error * inv_factor;
@@ -192,8 +200,6 @@ static void apply_temporal_filter(
   }
 }

-// TODO: bug aomedia:493082083 - Modify this function to support TF_BLOCK_SIZE
-// of 64x64.
 void av1_apply_temporal_filter_neon(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -201,15 +207,8 @@ void av1_apply_temporal_filter_neon(
     const int *subblock_mses, const int q_factor, const int filter_strength,
     int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
     uint16_t *count) {
-  if (block_size == BLOCK_64X64) {
-    av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row,
-                                mb_col, num_planes, noise_levels, subblock_mvs,
-                                subblock_mses, q_factor, filter_strength,
-                                tf_wgt_calc_lvl, pred, accum, count);
-    return;
-  }
   const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
-  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+  assert(block_size == BLOCK_64X64 && "Only support 64x64 block with Neon!");
   assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
   assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
@@ -240,16 +239,16 @@ void av1_apply_temporal_filter_neon(
   // Smaller strength -> smaller filtering weight.
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
-  double d_factor[4] = { 0 };
+  double d_factor[NUM_16X16] = { 0 };
   uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
   uint32_t luma_sse_sum[BW * BH] = { 0 };

-  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+  double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+  distance_threshold = AOMMAX(distance_threshold, 1);
+  for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
     // Larger motion vector -> smaller filtering weight.
     const MV mv = subblock_mvs[subblock_idx];
     const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
-    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
-    distance_threshold = AOMMAX(distance_threshold, 1);
     d_factor[subblock_idx] = distance / distance_threshold;
     d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
   }
diff --git a/av1/encoder/arm/temporal_filter_neon_dotprod.c b/av1/encoder/arm/temporal_filter_neon_dotprod.c
index fc6a1252d3..71ece3ea40 100644
--- a/av1/encoder/arm/temporal_filter_neon_dotprod.c
+++ b/av1/encoder/arm/temporal_filter_neon_dotprod.c
@@ -71,8 +71,8 @@ static void apply_temporal_filter(
     const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
     const double decay_factor, const double inv_factor,
     const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
-  assert(((block_width == 16) || (block_width == 32)) &&
-         ((block_height == 16) || (block_height == 32)));
+  assert(((block_width == 64) || (block_width == 32)) &&
+         ((block_height == 64) || (block_height == 32)));

   uint32_t diff_sse[BH][BW];
   const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
@@ -171,12 +171,16 @@ static void apply_temporal_filter(
   // Perform filtering.
   if (tf_wgt_calc_lvl == 0) {
     for (unsigned int i = 0, k = 0; i < block_height; i++) {
+      const int y32 = i / (block_height / 2);
+      const int y16 = (i % (block_height / 2)) / (block_height / 4);
+
       for (unsigned int j = 0; j < block_width; j++, k++) {
         const int pixel_value = frame[i * stride + j];

         const double window_error = diff_sse[i][j] * inv_num_ref_pixels;
-        const int subblock_idx =
-            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const int x32 = j / (block_width / 2);
+        const int x16 = (j % (block_width / 2)) / (block_width / 4);
+        const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
         const double block_error = (double)subblock_mses[subblock_idx];
         const double combined_error =
             weight_factor * window_error + block_error * inv_factor;
@@ -191,12 +195,16 @@ static void apply_temporal_filter(
     }
   } else {
     for (unsigned int i = 0, k = 0; i < block_height; i++) {
+      const int y32 = i / (block_height / 2);
+      const int y16 = (i % (block_height / 2)) / (block_height / 4);
+
       for (unsigned int j = 0; j < block_width; j++, k++) {
         const int pixel_value = frame[i * stride + j];

         const double window_error = diff_sse[i][j] * inv_num_ref_pixels;
-        const int subblock_idx =
-            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+        const int x32 = j / (block_width / 2);
+        const int x16 = (j % (block_width / 2)) / (block_width / 4);
+        const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
         const double block_error = (double)subblock_mses[subblock_idx];
         const double combined_error =
             weight_factor * window_error + block_error * inv_factor;
@@ -214,8 +222,6 @@ static void apply_temporal_filter(
   }
 }

-// TODO: bug aomedia:493082083 - Modify this function to support TF_BLOCK_SIZE
-// of 64x64.
 void av1_apply_temporal_filter_neon_dotprod(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -223,15 +229,8 @@ void av1_apply_temporal_filter_neon_dotprod(
     const int *subblock_mses, const int q_factor, const int filter_strength,
     int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
     uint16_t *count) {
-  if (block_size == BLOCK_64X64) {
-    av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row,
-                                mb_col, num_planes, noise_levels, subblock_mvs,
-                                subblock_mses, q_factor, filter_strength,
-                                tf_wgt_calc_lvl, pred, accum, count);
-    return;
-  }
   const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
-  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+  assert(block_size == BLOCK_64X64 && "Only support 64x64 block with Neon!");
   assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
   assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
@@ -262,16 +261,16 @@ void av1_apply_temporal_filter_neon_dotprod(
   // Smaller strength -> smaller filtering weight.
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
-  double d_factor[4] = { 0 };
+  double d_factor[NUM_16X16] = { 0 };
   uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };
   uint32_t luma_sse_sum[BW * BH] = { 0 };

-  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+  double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+  distance_threshold = AOMMAX(distance_threshold, 1);
+  for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
     // Larger motion vector -> smaller filtering weight.
     const MV mv = subblock_mvs[subblock_idx];
     const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
-    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
-    distance_threshold = AOMMAX(distance_threshold, 1);
     d_factor[subblock_idx] = distance / distance_threshold;
     d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
   }
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 7a6a330def..2d17c2edfe 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -325,26 +325,23 @@ INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
                                  Values(0, 1)));
 #endif  // HAVE_SSE2

-// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
-// needs to be updated.
-// #if HAVE_NEON
-// TemporalFilterFuncParam temporal_filter_test_neon[] = {
-// TemporalFilterFuncParam(
-//    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) };
-// INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
-//                         Combine(ValuesIn(temporal_filter_test_neon),
-//                                 Values(0, 1)));
-// #endif  // HAVE_NEON
+#if HAVE_NEON
+TemporalFilterFuncParam temporal_filter_test_neon[] = { TemporalFilterFuncParam(
+    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) };
+INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_neon),
+                                 Values(0, 1)));
+#endif  // HAVE_NEON

-// #if HAVE_NEON_DOTPROD
-// TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
-//   TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
-//                           &av1_apply_temporal_filter_neon_dotprod)
-// };
-// INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
-//                          Combine(ValuesIn(temporal_filter_test_neon_dotprod),
-//                                  Values(0, 1)));
-// #endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_DOTPROD
+TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
+  TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
+                          &av1_apply_temporal_filter_neon_dotprod)
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_neon_dotprod),
+                                 Values(0, 1)));
+#endif  // HAVE_NEON_DOTPROD

 #if HAVE_AVX2 || HAVE_NEON
 // Width and height for which av1_estimate_noise_from_single_plane() will be