Commit 98ac57476a for aom

commit 98ac57476a46b65689bbed19320c9c055c730f2a
Author: Li Zhang <li.zhang2@arm.com>
Date:   Mon Mar 30 11:13:57 2026 +0200

    Arm: Improve av1_apply_temporal_filter

    Add a SIMD path for the luma SSE sum calculation when the color format
    is 4:2:0 in the Neon and Neon Dotprod implementations of
    av1_apply_temporal_filter.

    Change-Id: If73f5c4567833e04907eea572b39c1b9c8e3e43e

diff --git a/av1/encoder/arm/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c
index ba158d9550..1c68b57bf4 100644
--- a/av1/encoder/arm/temporal_filter_neon.c
+++ b/av1/encoder/arm/temporal_filter_neon.c
@@ -280,13 +280,30 @@ void av1_apply_temporal_filter_neon(
     // will be more accurate. The luma sse sum is reused in both chroma
     // planes.
     if (plane == AOM_PLANE_U) {
-      for (unsigned int i = 0; i < plane_h; i++) {
-        for (unsigned int j = 0; j < plane_w; j++) {
-          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+      if (ss_x_shift == 1 && ss_y_shift == 1) {
+        for (unsigned int i = 0; i < plane_h; ++i) {
+          const uint16_t *src = &frame_sse[2 * i * SSE_STRIDE + 2];
+          uint32_t *dst = luma_sse_sum + i * BW;
+
+          for (unsigned int j = 0; j < plane_w; j += 4) {
+            const uint16x8_t s0 = vld1q_u16(src + j * 2);
+            const uint16x8_t s1 = vld1q_u16(src + SSE_STRIDE + j * 2);
+
+            uint32x4_t sum = vpaddlq_u16(s0);
+            sum = vpadalq_u16(sum, s1);
+
+            vst1q_u32(dst + j, sum);
+          }
+        }
+      } else {
+        for (unsigned int i = 0; i < plane_h; i++) {
+          for (unsigned int j = 0; j < plane_w; j++) {
+            for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+              for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+                const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+                const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+                luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+              }
             }
           }
         }
diff --git a/av1/encoder/arm/temporal_filter_neon_dotprod.c b/av1/encoder/arm/temporal_filter_neon_dotprod.c
index 71ece3ea40..b16c871f28 100644
--- a/av1/encoder/arm/temporal_filter_neon_dotprod.c
+++ b/av1/encoder/arm/temporal_filter_neon_dotprod.c
@@ -302,15 +302,37 @@ void av1_apply_temporal_filter_neon_dotprod(
     // will be more accurate. The luma sse sum is reused in both chroma
     // planes.
     if (plane == AOM_PLANE_U) {
-      for (unsigned int i = 0; i < plane_h; i++) {
-        for (unsigned int j = 0; j < plane_w; j++) {
-          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-              luma_sse_sum[i * BW + j] +=
-                  (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
-                   frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
+      if (ss_x_shift == 1 && ss_y_shift == 1) {
+        for (unsigned int i = 0; i < plane_h; ++i) {
+          const uint8_t *src = &frame_abs_diff[2 * i * SSE_STRIDE + 2];
+          uint32_t *dst = luma_sse_sum + i * BW;
+
+          for (unsigned int j = 0; j < plane_w; j += 8) {
+            const uint8x16_t s0 = vld1q_u8(src + j * 2);
+            const uint8x16_t s1 = vld1q_u8(src + SSE_STRIDE + j * 2);
+
+            uint8x16x2_t tmp = vzipq_u8(s0, s1);
+
+            const uint32x4_t sum0 =
+                vdotq_u32(vdupq_n_u32(0), tmp.val[0], tmp.val[0]);
+            const uint32x4_t sum1 =
+                vdotq_u32(vdupq_n_u32(0), tmp.val[1], tmp.val[1]);
+
+            vst1q_u32(dst + j, sum0);
+            vst1q_u32(dst + j + 4, sum1);
+          }
+        }
+      } else {
+        for (unsigned int i = 0; i < plane_h; i++) {
+          for (unsigned int j = 0; j < plane_w; j++) {
+            for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+              for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+                const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+                const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+                luma_sse_sum[i * BW + j] +=
+                    (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
+                     frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
+              }
             }
           }
         }