Commit d962719cee for aom
commit d962719ceea77453707862c8100c9f94220a3bf5
Author: Li Zhang <li.zhang2@arm.com>
Date: Wed Apr 15 12:23:29 2026 +0000
Arm: Enable Neon for av1_highbd_apply_temporal_filter
Enable av1_highbd_apply_temporal_filter_neon to support TF_BLOCK_SIZE of
64x64. Also add a specialized path for 420 U/V-plane luma sse sum
calculation.
Bug: 493082083
Change-Id: I6e20f438b5e637c27d64281ef1a3d1fae0ed2c9a
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
index 7ae126e0b2..0a93c1298c 100644
--- a/aom_dsp/arm/sum_neon.h
+++ b/aom_dsp/arm/sum_neon.h
@@ -242,6 +242,16 @@ static inline uint32_t horizontal_add_u16x4(const uint16x4_t a) {
#endif
}
+static inline uint32x4_t horizontal_add_2d_u32(uint32x4_t a, uint32x4_t b) {  // Pairwise-adds adjacent lanes; returns { a0+a1, a2+a3, b0+b1, b2+b3 }.
+#if AOM_ARCH_AARCH64
+ return vpaddq_u32(a, b);  // One 128-bit pairwise add covers both inputs.
+#else  // Non-AArch64 builds: assemble the same result from two 64-bit pairwise adds.
+ const uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a));  // { a0+a1, a2+a3 }
+ const uint32x2_t b0 = vpadd_u32(vget_low_u32(b), vget_high_u32(b));  // { b0+b1, b2+b3 }
+ return vcombine_u32(a0, b0);  // Sums from a fill the low half, sums from b the high half.
+#endif  // AOM_ARCH_AARCH64
+}
+
static inline int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
#if AOM_ARCH_AARCH64
return vpaddq_s32(a, b);
diff --git a/av1/encoder/arm/highbd_temporal_filter_neon.c b/av1/encoder/arm/highbd_temporal_filter_neon.c
index dddac6ad6f..f89c961cd2 100644
--- a/av1/encoder/arm/highbd_temporal_filter_neon.c
+++ b/av1/encoder/arm/highbd_temporal_filter_neon.c
@@ -85,8 +85,8 @@ static void highbd_apply_temporal_filter(
const double decay_factor, const double inv_factor,
const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl,
int bd) {
- assert(((block_width == 16) || (block_width == 32)) &&
- ((block_height == 16) || (block_height == 32)));
+ assert(((block_width == 64) || (block_width == 32)) &&
+ ((block_height == 64) || (block_height == 32)));
uint32_t acc_5x5_neon[BH][BW] = { 0 };
const int half_window = TF_WINDOW_LENGTH >> 1;
@@ -233,6 +233,9 @@ static void highbd_apply_temporal_filter(
// Perform filtering.
if (tf_wgt_calc_lvl == 0) {
for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+ const int y16_blk_raster_offset =
+ ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
for (unsigned int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame[i * stride + j];
// Scale down the difference for high bit depth input.
@@ -240,8 +243,13 @@ static void highbd_apply_temporal_filter(
(acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
const double window_error = diff_sse * inv_num_ref_pixels;
+ const int x32_blk_raster_offset = j >= (block_width >> 1);
+ const int x16_blk_raster_offset =
+ (j % (block_width >> 1)) >= (block_width >> 2);
const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+ y16_blk_raster_offset + x16_blk_raster_offset;
+
const double block_error = (double)subblock_mses[subblock_idx];
const double combined_error =
weight_factor * window_error + block_error * inv_factor;
@@ -256,6 +264,9 @@ static void highbd_apply_temporal_filter(
}
} else {
for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+ const int y16_blk_raster_offset =
+ ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
for (unsigned int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame[i * stride + j];
// Scale down the difference for high bit depth input.
@@ -263,8 +274,13 @@ static void highbd_apply_temporal_filter(
(acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
const double window_error = diff_sse * inv_num_ref_pixels;
+ const int x32_blk_raster_offset = j >= (block_width >> 1);
+ const int x16_blk_raster_offset =
+ (j % (block_width >> 1)) >= (block_width >> 2);
const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+ y16_blk_raster_offset + x16_blk_raster_offset;
+
const double block_error = (double)subblock_mses[subblock_idx];
const double combined_error =
weight_factor * window_error + block_error * inv_factor;
@@ -289,14 +305,8 @@ void av1_highbd_apply_temporal_filter_neon(
const int *subblock_mses, const int q_factor, const int filter_strength,
int tf_wgt_calc_lvl, const uint8_t *pred8, uint32_t *accum,
uint16_t *count) {
- if (block_size == BLOCK_64X64) {
- av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row,
- mb_col, num_planes, noise_levels, subblock_mvs,
- subblock_mses, q_factor, filter_strength,
- tf_wgt_calc_lvl, pred8, accum, count);
- return;
- }
const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_64X64 && "Only support 64x64 block with Neon!");
assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
(void)is_high_bitdepth;
@@ -327,17 +337,17 @@ void av1_highbd_apply_temporal_filter_neon(
// Smaller strength -> smaller filtering weight.
double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
s_decay = CLIP(s_decay, 1e-5, 1);
- double d_factor[4] = { 0 };
+ double d_factor[NUM_16X16] = { 0 };
uint32_t frame_sse[BW * BH] = { 0 };
uint32_t luma_sse_sum[BW * BH] = { 0 };
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
// Larger motion vector -> smaller filtering weight.
const MV mv = subblock_mvs[subblock_idx];
const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
- double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
- distance_threshold = AOMMAX(distance_threshold, 1);
d_factor[subblock_idx] = distance / distance_threshold;
d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
}
@@ -349,7 +359,6 @@ void av1_highbd_apply_temporal_filter_neon(
const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
const uint32_t frame_stride =
frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
- const uint32_t frame_sse_stride = plane_w;
const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
const uint16_t *ref =
@@ -371,28 +380,47 @@ void av1_highbd_apply_temporal_filter_neon(
// will be more accurate. The luma sse sum is reused in both chroma
// planes.
if (plane == AOM_PLANE_U) {
- for (unsigned int i = 0; i < plane_h; i++) {
- for (unsigned int j = 0; j < plane_w; j++) {
- for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
- for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
- const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
- const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
- const int ww = frame_sse_stride
- << ss_x_shift; // Width of Y-plane.
- luma_sse_sum[i * BW + j] += frame_sse[yy * ww + xx];
+ if (ss_x_shift == 1 && ss_y_shift == 1) {
+ for (unsigned int i = 0; i < plane_h; ++i) {
+ const uint32_t *src = &frame_sse[2 * i * BW];
+ uint32_t *dst = luma_sse_sum + i * BW;
+
+ for (unsigned int j = 0; j < plane_w; j += 4) {
+ const uint32x4_t s0_lo = vld1q_u32(src + j * 2);
+ const uint32x4_t s0_hi = vld1q_u32(src + j * 2 + 4);
+ const uint32x4_t s1_lo = vld1q_u32(src + BW + j * 2);
+ const uint32x4_t s1_hi = vld1q_u32(src + BW + j * 2 + 4);
+
+ uint32x4_t sum0 = horizontal_add_2d_u32(s0_lo, s0_hi);
+ uint32x4_t sum1 = horizontal_add_2d_u32(s1_lo, s1_hi);
+
+ sum0 = vaddq_u32(sum0, sum1);
+
+ vst1q_u32(dst + j, sum0);
+ }
+ }
+ } else {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * BW + xx];
+ }
}
}
}
}
}
get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
- plane_h, frame_sse, frame_sse_stride);
+ plane_h, frame_sse, BW);
highbd_apply_temporal_filter(
pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses,
- accum + plane_offset, count + plane_offset, frame_sse, frame_sse_stride,
- luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor,
- weight_factor, d_factor, tf_wgt_calc_lvl, mbd->bd);
+ accum + plane_offset, count + plane_offset, frame_sse, BW, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor, weight_factor, d_factor,
+ tf_wgt_calc_lvl, mbd->bd);
plane_offset += plane_h * plane_w;
}
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 2d17c2edfe..4b433226ea 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -718,17 +718,15 @@ INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
Values(0, 1)));
#endif // HAVE_AVX2
-// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
-// needs to be updated.
-// #if HAVE_NEON
-// HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {
-// HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
-// &av1_highbd_apply_temporal_filter_neon)
-//};
-// INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
-// Combine(ValuesIn(HBDtemporal_filter_test_neon),
-// Values(0, 1)));
-// #endif // HAVE_NEON
+#if HAVE_NEON
+HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {  // Function pairs fed to the NEON instantiation below.
+ HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,  // C implementation
+ &av1_highbd_apply_temporal_filter_neon)  // Neon implementation
+};
+INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
+ Combine(ValuesIn(HBDtemporal_filter_test_neon),
+ Values(0, 1)));  // Second parameter takes 0 and 1 - presumably tf_wgt_calc_lvl; confirm against HBDTemporalFilterTest.
+#endif // HAVE_NEON
using HBDEstimateNoiseFunc = double (*)(const uint16_t *src, int height,
int width, int stride, int bit_depth,