Commit bd15da94b0 for aom
commit bd15da94b034046a76a6ce9dbcb5137210244bb0
Author: Li Zhang <li.zhang2@arm.com>
Date: Mon Mar 30 11:13:57 2026 +0200
Arm: Enable Neon and Neon Dotprod for av1_apply_temporal_filter
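Update the Neon and Neon Dotprod implementations of
av1_apply_temporal_filter to work with a TF_BLOCK_SIZE of 64x64.
Remove the fallback to the C implementation for BLOCK_64X64 and
re-enable the corresponding unit tests.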
Bug: aomedia:493082083
Change-Id: I5f32ec403b5370fe94ddfaef6f4b121fc6cd20c5
diff --git a/av1/encoder/arm/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c
index 3f62af6659..ba158d9550 100644
--- a/av1/encoder/arm/temporal_filter_neon.c
+++ b/av1/encoder/arm/temporal_filter_neon.c
@@ -83,8 +83,8 @@ static void apply_temporal_filter(
const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
const double decay_factor, const double inv_factor,
const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
- assert(((block_width == 16) || (block_width == 32)) &&
- ((block_height == 16) || (block_height == 32)));
+ assert(((block_width == 64) || (block_width == 32)) &&
+ ((block_height == 64) || (block_height == 32)));
uint32_t diff_sse[BH][BW];
const uint16x8x4_t vmask = vld1q_u16_x4(kSlidingWindowMask);
@@ -149,12 +149,16 @@ static void apply_temporal_filter(
// Perform filtering.
if (tf_wgt_calc_lvl == 0) {
for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ const int y32 = i / (block_height / 2);
+ const int y16 = (i % (block_height / 2)) / (block_height / 4);
+
for (unsigned int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame[i * stride + j];
const double window_error = diff_sse[i][j] * inv_num_ref_pixels;
- const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const int x32 = j / (block_width / 2);
+ const int x16 = (j % (block_width / 2)) / (block_width / 4);
+ const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
const double block_error = (double)subblock_mses[subblock_idx];
const double combined_error =
weight_factor * window_error + block_error * inv_factor;
@@ -169,12 +173,16 @@ static void apply_temporal_filter(
}
} else {
for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ const int y32 = i / (block_height / 2);
+ const int y16 = (i % (block_height / 2)) / (block_height / 4);
+
for (unsigned int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame[i * stride + j];
const double window_error = diff_sse[i][j] * inv_num_ref_pixels;
- const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const int x32 = j / (block_width / 2);
+ const int x16 = (j % (block_width / 2)) / (block_width / 4);
+ const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
const double block_error = (double)subblock_mses[subblock_idx];
const double combined_error =
weight_factor * window_error + block_error * inv_factor;
@@ -192,8 +200,6 @@ static void apply_temporal_filter(
}
}
-// TODO: bug aomedia:493082083 - Modify this function to support TF_BLOCK_SIZE
-// of 64x64.
void av1_apply_temporal_filter_neon(
const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -201,15 +207,8 @@ void av1_apply_temporal_filter_neon(
const int *subblock_mses, const int q_factor, const int filter_strength,
int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
uint16_t *count) {
- if (block_size == BLOCK_64X64) {
- av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row,
- mb_col, num_planes, noise_levels, subblock_mvs,
- subblock_mses, q_factor, filter_strength,
- tf_wgt_calc_lvl, pred, accum, count);
- return;
- }
const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
- assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+ assert(block_size == BLOCK_64X64 && "Only support 64x64 block with Neon!");
assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
@@ -240,16 +239,16 @@ void av1_apply_temporal_filter_neon(
// Smaller strength -> smaller filtering weight.
double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
s_decay = CLIP(s_decay, 1e-5, 1);
- double d_factor[4] = { 0 };
+ double d_factor[NUM_16X16] = { 0 };
uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
uint32_t luma_sse_sum[BW * BH] = { 0 };
- for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
// Larger motion vector -> smaller filtering weight.
const MV mv = subblock_mvs[subblock_idx];
const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
- double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
- distance_threshold = AOMMAX(distance_threshold, 1);
d_factor[subblock_idx] = distance / distance_threshold;
d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
}
diff --git a/av1/encoder/arm/temporal_filter_neon_dotprod.c b/av1/encoder/arm/temporal_filter_neon_dotprod.c
index fc6a1252d3..71ece3ea40 100644
--- a/av1/encoder/arm/temporal_filter_neon_dotprod.c
+++ b/av1/encoder/arm/temporal_filter_neon_dotprod.c
@@ -71,8 +71,8 @@ static void apply_temporal_filter(
const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
const double decay_factor, const double inv_factor,
const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
- assert(((block_width == 16) || (block_width == 32)) &&
- ((block_height == 16) || (block_height == 32)));
+ assert(((block_width == 64) || (block_width == 32)) &&
+ ((block_height == 64) || (block_height == 32)));
uint32_t diff_sse[BH][BW];
const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
@@ -171,12 +171,16 @@ static void apply_temporal_filter(
// Perform filtering.
if (tf_wgt_calc_lvl == 0) {
for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ const int y32 = i / (block_height / 2);
+ const int y16 = (i % (block_height / 2)) / (block_height / 4);
+
for (unsigned int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame[i * stride + j];
const double window_error = diff_sse[i][j] * inv_num_ref_pixels;
- const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const int x32 = j / (block_width / 2);
+ const int x16 = (j % (block_width / 2)) / (block_width / 4);
+ const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
const double block_error = (double)subblock_mses[subblock_idx];
const double combined_error =
weight_factor * window_error + block_error * inv_factor;
@@ -191,12 +195,16 @@ static void apply_temporal_filter(
}
} else {
for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ const int y32 = i / (block_height / 2);
+ const int y16 = (i % (block_height / 2)) / (block_height / 4);
+
for (unsigned int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame[i * stride + j];
const double window_error = diff_sse[i][j] * inv_num_ref_pixels;
- const int subblock_idx =
- (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const int x32 = j / (block_width / 2);
+ const int x16 = (j % (block_width / 2)) / (block_width / 4);
+ const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
const double block_error = (double)subblock_mses[subblock_idx];
const double combined_error =
weight_factor * window_error + block_error * inv_factor;
@@ -214,8 +222,6 @@ static void apply_temporal_filter(
}
}
-// TODO: bug aomedia:493082083 - Modify this function to support TF_BLOCK_SIZE
-// of 64x64.
void av1_apply_temporal_filter_neon_dotprod(
const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -223,15 +229,8 @@ void av1_apply_temporal_filter_neon_dotprod(
const int *subblock_mses, const int q_factor, const int filter_strength,
int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
uint16_t *count) {
- if (block_size == BLOCK_64X64) {
- av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row,
- mb_col, num_planes, noise_levels, subblock_mvs,
- subblock_mses, q_factor, filter_strength,
- tf_wgt_calc_lvl, pred, accum, count);
- return;
- }
const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
- assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+ assert(block_size == BLOCK_64X64 && "Only support 64x64 block with Neon!");
assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
@@ -262,16 +261,16 @@ void av1_apply_temporal_filter_neon_dotprod(
// Smaller strength -> smaller filtering weight.
double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
s_decay = CLIP(s_decay, 1e-5, 1);
- double d_factor[4] = { 0 };
+ double d_factor[NUM_16X16] = { 0 };
uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };
uint32_t luma_sse_sum[BW * BH] = { 0 };
- for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
// Larger motion vector -> smaller filtering weight.
const MV mv = subblock_mvs[subblock_idx];
const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
- double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
- distance_threshold = AOMMAX(distance_threshold, 1);
d_factor[subblock_idx] = distance / distance_threshold;
d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
}
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 7a6a330def..2d17c2edfe 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -325,26 +325,23 @@ INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
Values(0, 1)));
#endif // HAVE_SSE2
-// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
-// needs to be updated.
-// #if HAVE_NEON
-// TemporalFilterFuncParam temporal_filter_test_neon[] = {
-// TemporalFilterFuncParam(
-// &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) };
-// INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
-// Combine(ValuesIn(temporal_filter_test_neon),
-// Values(0, 1)));
-// #endif // HAVE_NEON
+#if HAVE_NEON
+TemporalFilterFuncParam temporal_filter_test_neon[] = { TemporalFilterFuncParam(
+ &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) };
+INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
+ Combine(ValuesIn(temporal_filter_test_neon),
+ Values(0, 1)));
+#endif // HAVE_NEON
-// #if HAVE_NEON_DOTPROD
-// TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
-// TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
-// &av1_apply_temporal_filter_neon_dotprod)
-// };
-// INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
-// Combine(ValuesIn(temporal_filter_test_neon_dotprod),
-// Values(0, 1)));
-// #endif // HAVE_NEON_DOTPROD
+#if HAVE_NEON_DOTPROD
+TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
+ TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
+ &av1_apply_temporal_filter_neon_dotprod)
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
+ Combine(ValuesIn(temporal_filter_test_neon_dotprod),
+ Values(0, 1)));
+#endif // HAVE_NEON_DOTPROD
#if HAVE_AVX2 || HAVE_NEON
// Width and height for which av1_estimate_noise_from_single_plane() will be