Commit 98ac57476a for aom
commit 98ac57476a46b65689bbed19320c9c055c730f2a
Author: Li Zhang <li.zhang2@arm.com>
Date: Mon Mar 30 11:13:57 2026 +0200
Arm: Improve av1_apply_temporal_filter
Add a SIMD path for the luma SSE sum calculation, used when the color format
is 4:2:0, to the Neon and Neon Dotprod implementations of av1_apply_temporal_filter.
Change-Id: If73f5c4567833e04907eea572b39c1b9c8e3e43e
diff --git a/av1/encoder/arm/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c
index ba158d9550..1c68b57bf4 100644
--- a/av1/encoder/arm/temporal_filter_neon.c
+++ b/av1/encoder/arm/temporal_filter_neon.c
@@ -280,13 +280,30 @@ void av1_apply_temporal_filter_neon(
// will be more accurate. The luma sse sum is reused in both chroma
// planes.
if (plane == AOM_PLANE_U) {
- for (unsigned int i = 0; i < plane_h; i++) {
- for (unsigned int j = 0; j < plane_w; j++) {
- for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
- for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
- const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
- const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
- luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ if (ss_x_shift == 1 && ss_y_shift == 1) {
+ for (unsigned int i = 0; i < plane_h; ++i) {
+ const uint16_t *src = &frame_sse[2 * i * SSE_STRIDE + 2];
+ uint32_t *dst = luma_sse_sum + i * BW;
+
+ for (unsigned int j = 0; j < plane_w; j += 4) {
+ const uint16x8_t s0 = vld1q_u16(src + j * 2);
+ const uint16x8_t s1 = vld1q_u16(src + SSE_STRIDE + j * 2);
+
+ uint32x4_t sum = vpaddlq_u16(s0);
+ sum = vpadalq_u16(sum, s1);
+
+ vst1q_u32(dst + j, sum);
+ }
+ }
+ } else {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
}
}
}
diff --git a/av1/encoder/arm/temporal_filter_neon_dotprod.c b/av1/encoder/arm/temporal_filter_neon_dotprod.c
index 71ece3ea40..b16c871f28 100644
--- a/av1/encoder/arm/temporal_filter_neon_dotprod.c
+++ b/av1/encoder/arm/temporal_filter_neon_dotprod.c
@@ -302,15 +302,37 @@ void av1_apply_temporal_filter_neon_dotprod(
// will be more accurate. The luma sse sum is reused in both chroma
// planes.
if (plane == AOM_PLANE_U) {
- for (unsigned int i = 0; i < plane_h; i++) {
- for (unsigned int j = 0; j < plane_w; j++) {
- for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
- for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
- const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
- const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
- luma_sse_sum[i * BW + j] +=
- (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
- frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
+ if (ss_x_shift == 1 && ss_y_shift == 1) {
+ for (unsigned int i = 0; i < plane_h; ++i) {
+ const uint8_t *src = &frame_abs_diff[2 * i * SSE_STRIDE + 2];
+ uint32_t *dst = luma_sse_sum + i * BW;
+
+ for (unsigned int j = 0; j < plane_w; j += 8) {
+ const uint8x16_t s0 = vld1q_u8(src + j * 2);
+ const uint8x16_t s1 = vld1q_u8(src + SSE_STRIDE + j * 2);
+
+ uint8x16x2_t tmp = vzipq_u8(s0, s1);
+
+ const uint32x4_t sum0 =
+ vdotq_u32(vdupq_n_u32(0), tmp.val[0], tmp.val[0]);
+ const uint32x4_t sum1 =
+ vdotq_u32(vdupq_n_u32(0), tmp.val[1], tmp.val[1]);
+
+ vst1q_u32(dst + j, sum0);
+ vst1q_u32(dst + j + 4, sum1);
+ }
+ }
+ } else {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] +=
+ (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
+ frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
+ }
}
}
}