Commit d962719cee for aom

commit d962719ceea77453707862c8100c9f94220a3bf5
Author: Li Zhang <li.zhang2@arm.com>
Date:   Wed Apr 15 12:23:29 2026 +0000

    Arm: Enable Neon for av1_highbd_apply_temporal_filter

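    Enable av1_highbd_apply_temporal_filter_neon to support a TF_BLOCK_SIZE
    of 64x64, replacing the previous fallback to the C implementation for
    BLOCK_64X64. Also add a specialized Neon path for computing the luma
    SSE sums used by the U/V planes when the input is 4:2:0 subsampled.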

    Bug: 493082083
    Change-Id: I6e20f438b5e637c27d64281ef1a3d1fae0ed2c9a

diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index 7ae126e0b2..0a93c1298c 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -242,6 +242,16 @@ static inline uint32_t horizontal_add_u16x4(const uint16x4_t a) {
 #endif
 }

+static inline uint32x4_t horizontal_add_2d_u32(uint32x4_t a, uint32x4_t b) {
+#if AOM_ARCH_AARCH64
+  return vpaddq_u32(a, b);  // { a[0]+a[1], a[2]+a[3], b[0]+b[1], b[2]+b[3] }
+#else  // Same result built from two 64-bit pairwise adds.
+  const uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a));
+  const uint32x2_t b0 = vpadd_u32(vget_low_u32(b), vget_high_u32(b));
+  return vcombine_u32(a0, b0);  // Lanes 0-1 come from a, lanes 2-3 from b.
+#endif  // AOM_ARCH_AARCH64
+}
+
 static inline int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
 #if AOM_ARCH_AARCH64
   return vpaddq_s32(a, b);
diff --git a/av1/encoder/arm/highbd_temporal_filter_neon.c b/av1/encoder/arm/highbd_temporal_filter_neon.c
index dddac6ad6f..f89c961cd2 100644
--- a/av1/encoder/arm/highbd_temporal_filter_neon.c
+++ b/av1/encoder/arm/highbd_temporal_filter_neon.c
@@ -85,8 +85,8 @@ static void highbd_apply_temporal_filter(
     const double decay_factor, const double inv_factor,
     const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl,
     int bd) {
-  assert(((block_width == 16) || (block_width == 32)) &&
-         ((block_height == 16) || (block_height == 32)));
+  assert(((block_width == 64) || (block_width == 32)) &&
+         ((block_height == 64) || (block_height == 32)));

   uint32_t acc_5x5_neon[BH][BW] = { 0 };
   const int half_window = TF_WINDOW_LENGTH >> 1;
@@ -233,6 +233,9 @@ static void highbd_apply_temporal_filter(
   // Perform filtering.
   if (tf_wgt_calc_lvl == 0) {
     for (unsigned int i = 0, k = 0; i < block_height; i++) {
+      const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+      const int y16_blk_raster_offset =
+          ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
       for (unsigned int j = 0; j < block_width; j++, k++) {
         const int pixel_value = frame[i * stride + j];
         // Scale down the difference for high bit depth input.
@@ -240,8 +243,13 @@ static void highbd_apply_temporal_filter(
             (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);

         const double window_error = diff_sse * inv_num_ref_pixels;
+        const int x32_blk_raster_offset = j >= (block_width >> 1);
+        const int x16_blk_raster_offset =
+            (j % (block_width >> 1)) >= (block_width >> 2);
         const int subblock_idx =
-            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+            ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+            y16_blk_raster_offset + x16_blk_raster_offset;
+
         const double block_error = (double)subblock_mses[subblock_idx];
         const double combined_error =
             weight_factor * window_error + block_error * inv_factor;
@@ -256,6 +264,9 @@ static void highbd_apply_temporal_filter(
     }
   } else {
     for (unsigned int i = 0, k = 0; i < block_height; i++) {
+      const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+      const int y16_blk_raster_offset =
+          ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
       for (unsigned int j = 0; j < block_width; j++, k++) {
         const int pixel_value = frame[i * stride + j];
         // Scale down the difference for high bit depth input.
@@ -263,8 +274,13 @@ static void highbd_apply_temporal_filter(
             (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);

         const double window_error = diff_sse * inv_num_ref_pixels;
+        const int x32_blk_raster_offset = j >= (block_width >> 1);
+        const int x16_blk_raster_offset =
+            (j % (block_width >> 1)) >= (block_width >> 2);
         const int subblock_idx =
-            (i >= block_height / 2) * 2 + (j >= block_width / 2);
+            ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+            y16_blk_raster_offset + x16_blk_raster_offset;
+
         const double block_error = (double)subblock_mses[subblock_idx];
         const double combined_error =
             weight_factor * window_error + block_error * inv_factor;
@@ -289,14 +305,8 @@ void av1_highbd_apply_temporal_filter_neon(
     const int *subblock_mses, const int q_factor, const int filter_strength,
     int tf_wgt_calc_lvl, const uint8_t *pred8, uint32_t *accum,
     uint16_t *count) {
-  if (block_size == BLOCK_64X64) {
-    av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row,
-                                mb_col, num_planes, noise_levels, subblock_mvs,
-                                subblock_mses, q_factor, filter_strength,
-                                tf_wgt_calc_lvl, pred8, accum, count);
-    return;
-  }
   const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_64X64 && "Only support 64x64 block with Neon!");
   assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
   (void)is_high_bitdepth;
@@ -327,17 +337,17 @@ void av1_highbd_apply_temporal_filter_neon(
   // Smaller strength -> smaller filtering weight.
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
-  double d_factor[4] = { 0 };
+  double d_factor[NUM_16X16] = { 0 };
   uint32_t frame_sse[BW * BH] = { 0 };
   uint32_t luma_sse_sum[BW * BH] = { 0 };
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);

-  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+  double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+  distance_threshold = AOMMAX(distance_threshold, 1);
+  for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
     // Larger motion vector -> smaller filtering weight.
     const MV mv = subblock_mvs[subblock_idx];
     const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
-    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
-    distance_threshold = AOMMAX(distance_threshold, 1);
     d_factor[subblock_idx] = distance / distance_threshold;
     d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
   }
@@ -349,7 +359,6 @@ void av1_highbd_apply_temporal_filter_neon(
     const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
     const uint32_t frame_stride =
         frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
-    const uint32_t frame_sse_stride = plane_w;
     const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;

     const uint16_t *ref =
@@ -371,28 +380,47 @@ void av1_highbd_apply_temporal_filter_neon(
     // will be more accurate. The luma sse sum is reused in both chroma
     // planes.
     if (plane == AOM_PLANE_U) {
-      for (unsigned int i = 0; i < plane_h; i++) {
-        for (unsigned int j = 0; j < plane_w; j++) {
-          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-              const int ww = frame_sse_stride
-                             << ss_x_shift;  // Width of Y-plane.
-              luma_sse_sum[i * BW + j] += frame_sse[yy * ww + xx];
+      if (ss_x_shift == 1 && ss_y_shift == 1) {
+        for (unsigned int i = 0; i < plane_h; ++i) {
+          const uint32_t *src = &frame_sse[2 * i * BW];
+          uint32_t *dst = luma_sse_sum + i * BW;
+
+          for (unsigned int j = 0; j < plane_w; j += 4) {
+            const uint32x4_t s0_lo = vld1q_u32(src + j * 2);
+            const uint32x4_t s0_hi = vld1q_u32(src + j * 2 + 4);
+            const uint32x4_t s1_lo = vld1q_u32(src + BW + j * 2);
+            const uint32x4_t s1_hi = vld1q_u32(src + BW + j * 2 + 4);
+
+            uint32x4_t sum0 = horizontal_add_2d_u32(s0_lo, s0_hi);
+            uint32x4_t sum1 = horizontal_add_2d_u32(s1_lo, s1_hi);
+
+            sum0 = vaddq_u32(sum0, sum1);
+
+            vst1q_u32(dst + j, sum0);
+          }
+        }
+      } else {
+        for (unsigned int i = 0; i < plane_h; i++) {
+          for (unsigned int j = 0; j < plane_w; j++) {
+            for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+              for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+                const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+                const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+                luma_sse_sum[i * BW + j] += frame_sse[yy * BW + xx];
+              }
             }
           }
         }
       }
     }
     get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
-                      plane_h, frame_sse, frame_sse_stride);
+                      plane_h, frame_sse, BW);

     highbd_apply_temporal_filter(
         pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses,
-        accum + plane_offset, count + plane_offset, frame_sse, frame_sse_stride,
-        luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor,
-        weight_factor, d_factor, tf_wgt_calc_lvl, mbd->bd);
+        accum + plane_offset, count + plane_offset, frame_sse, BW, luma_sse_sum,
+        inv_num_ref_pixels, decay_factor, inv_factor, weight_factor, d_factor,
+        tf_wgt_calc_lvl, mbd->bd);

     plane_offset += plane_h * plane_w;
   }
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 2d17c2edfe..4b433226ea 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -718,17 +718,15 @@ INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
                                  Values(0, 1)));
 #endif  // HAVE_AVX2

-// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
-// needs to be updated.
-// #if HAVE_NEON
-// HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {
-//  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
-//                             &av1_highbd_apply_temporal_filter_neon)
-//};
-// INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
-//                         Combine(ValuesIn(HBDtemporal_filter_test_neon),
-//                                 Values(0, 1)));
-// #endif  // HAVE_NEON
+#if HAVE_NEON  // Run HBDTemporalFilterTest on the (C, Neon) function pair.
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {
+  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+                             &av1_highbd_apply_temporal_filter_neon)
+};
+INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
+                         Combine(ValuesIn(HBDtemporal_filter_test_neon),
+                                 Values(0, 1)));
+#endif  // HAVE_NEON

 using HBDEstimateNoiseFunc = double (*)(const uint16_t *src, int height,
                                         int width, int stride, int bit_depth,