Commit 73741777a5 for aom
commit 73741777a5431c63484c7f7cb5f4eb95fa63f212
Author: Diksha Singh <diksha.singh@ittiam.com>
Date: Mon Mar 9 22:43:11 2026 +0530
Enable AVX2 and SSE2 for av1_apply_temporal_filter()
The AVX2 and SSE2 implementations are modified for TF_BLOCK_SIZE of 64x64.
Speedup scaling w.r.t. the C implementation is as follows:
         tf_wgt_calc_lvl=0   tf_wgt_calc_lvl=1
AVX2           7.9x                14.9x
SSE2           6.1x                 7.8x
Change-Id: I34cb2b9625f7bc2847c1bd35a562120224a94d8f
diff --git a/av1/encoder/arm/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c
index 132045f24c..3f62af6659 100644
--- a/av1/encoder/arm/temporal_filter_neon.c
+++ b/av1/encoder/arm/temporal_filter_neon.c
@@ -192,6 +192,8 @@ static void apply_temporal_filter(
}
}
+// TODO: bug aomedia:493082083 - Modify this function to support TF_BLOCK_SIZE
+// of 64x64.
void av1_apply_temporal_filter_neon(
const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -199,6 +201,13 @@ void av1_apply_temporal_filter_neon(
const int *subblock_mses, const int q_factor, const int filter_strength,
int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
uint16_t *count) {
+ if (block_size == BLOCK_64X64) {
+ av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row,
+ mb_col, num_planes, noise_levels, subblock_mvs,
+ subblock_mses, q_factor, filter_strength,
+ tf_wgt_calc_lvl, pred, accum, count);
+ return;
+ }
const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
diff --git a/av1/encoder/arm/temporal_filter_neon_dotprod.c b/av1/encoder/arm/temporal_filter_neon_dotprod.c
index c8fd699008..fc6a1252d3 100644
--- a/av1/encoder/arm/temporal_filter_neon_dotprod.c
+++ b/av1/encoder/arm/temporal_filter_neon_dotprod.c
@@ -214,6 +214,8 @@ static void apply_temporal_filter(
}
}
+// TODO: bug aomedia:493082083 - Modify this function to support TF_BLOCK_SIZE
+// of 64x64.
void av1_apply_temporal_filter_neon_dotprod(
const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -221,6 +223,13 @@ void av1_apply_temporal_filter_neon_dotprod(
const int *subblock_mses, const int q_factor, const int filter_strength,
int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
uint16_t *count) {
+ if (block_size == BLOCK_64X64) {
+ av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row,
+ mb_col, num_planes, noise_levels, subblock_mvs,
+ subblock_mses, q_factor, filter_strength,
+ tf_wgt_calc_lvl, pred, accum, count);
+ return;
+ }
const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 3840d26de7..bf827f187a 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -46,9 +46,6 @@
// NOTE: All `tf` in this file means `temporal filtering`.
-// Number of 16x16 blocks within one 64x64 TF block.
-#define NUM_16X16 16
-
// Forward Declaration.
static void tf_determine_block_partition(const MV block_mv, const int block_mse,
const MV *midblock_mvs,
@@ -1176,7 +1173,7 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
#endif // CONFIG_AV1_HIGHBITDEPTH
} else {
// for 8-bit
- if (!is_yuv422_format && TF_BLOCK_SIZE == BLOCK_32X32 &&
+ if (!is_yuv422_format && TF_BLOCK_SIZE == BLOCK_64X64 &&
TF_WINDOW_LENGTH == 5) {
av1_apply_temporal_filter(
frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 3f93ab03af..d0108c41b7 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -26,8 +26,8 @@ struct ThreadData;
// TODO(wtc): These two variables are only used in avx2, sse2, neon
// implementations, where the block size is still hard coded to TF_BLOCK_SIZE.
// This should be fixed to align with the c implementation.
-#define BH 32
-#define BW 32
+#define BH 64
+#define BW 64
// Block size used in temporal filtering.
#define TF_BLOCK_SIZE BLOCK_64X64
@@ -35,6 +35,9 @@ struct ThreadData;
// Window size for temporal filtering.
#define TF_WINDOW_LENGTH 5
+// Number of 16x16 blocks within one 64x64 TF block.
+#define NUM_16X16 16
+
// A constant number, sqrt(pi / 2), used for noise estimation.
static const double SQRT_PI_BY_2 = 1.25331413732;
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index f3d85e123b..45d563be18 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -229,62 +229,35 @@ double av1_estimate_noise_from_single_plane_avx2(const uint8_t *src, int height,
return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
}
-static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+static AOM_FORCE_INLINE void get_squared_error_avx2(
const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
const unsigned int stride2, const int block_width, const int block_height,
uint16_t *frame_sse, const unsigned int sse_stride) {
- (void)block_width;
const uint8_t *src1 = frame1;
const uint8_t *src2 = frame2;
uint16_t *dst = frame_sse;
for (int i = 0; i < block_height; i++) {
- __m128i vf1_128, vf2_128;
- __m256i vf1, vf2, vdiff1, vsqdiff1;
-
- vf1_128 = _mm_loadu_si128((__m128i *)(src1));
- vf2_128 = _mm_loadu_si128((__m128i *)(src2));
- vf1 = _mm256_cvtepu8_epi16(vf1_128);
- vf2 = _mm256_cvtepu8_epi16(vf2_128);
- vdiff1 = _mm256_sub_epi16(vf1, vf2);
- vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1);
-
- _mm256_storeu_si256((__m256i *)(dst), vsqdiff1);
- // Set zero to uninitialized memory to avoid uninitialized loads later
- *(int *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128());
-
- src1 += stride, src2 += stride2;
- dst += sse_stride;
- }
-}
-
-static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
- const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
- const unsigned int stride2, const int block_width, const int block_height,
- uint16_t *frame_sse, const unsigned int sse_stride) {
- (void)block_width;
- const uint8_t *src1 = frame1;
- const uint8_t *src2 = frame2;
- uint16_t *dst = frame_sse;
- for (int i = 0; i < block_height; i++) {
- __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2;
-
- vsrc1 = _mm256_loadu_si256((__m256i *)src1);
- vsrc2 = _mm256_loadu_si256((__m256i *)src2);
- vmax = _mm256_max_epu8(vsrc1, vsrc2);
- vmin = _mm256_min_epu8(vsrc1, vsrc2);
- vdiff = _mm256_subs_epu8(vmax, vmin);
-
- __m128i vtmp1 = _mm256_castsi256_si128(vdiff);
- __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1);
- vdiff1 = _mm256_cvtepu8_epi16(vtmp1);
- vdiff2 = _mm256_cvtepu8_epi16(vtmp2);
-
- vres1 = _mm256_mullo_epi16(vdiff1, vdiff1);
- vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
- _mm256_storeu_si256((__m256i *)(dst), vres1);
- _mm256_storeu_si256((__m256i *)(dst + 16), vres2);
+ for (int j = 0; j < block_width; j += 32) {
+ __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2;
+
+ vsrc1 = _mm256_loadu_si256((__m256i *)(src1 + j));
+ vsrc2 = _mm256_loadu_si256((__m256i *)(src2 + j));
+ vmax = _mm256_max_epu8(vsrc1, vsrc2);
+ vmin = _mm256_min_epu8(vsrc1, vsrc2);
+ vdiff = _mm256_subs_epu8(vmax, vmin);
+
+ __m128i vtmp1 = _mm256_castsi256_si128(vdiff);
+ __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1);
+ vdiff1 = _mm256_cvtepu8_epi16(vtmp1);
+ vdiff2 = _mm256_cvtepu8_epi16(vtmp2);
+
+ vres1 = _mm256_mullo_epi16(vdiff1, vdiff1);
+ vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
+ _mm256_storeu_si256((__m256i *)(dst + j), vres1);
+ _mm256_storeu_si256((__m256i *)(dst + 16 + j), vres2);
+ }
// Set zero to uninitialized memory to avoid uninitialized loads later
- *(int *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128());
+ *(int *)(dst + block_width) = _mm_cvtsi128_si32(_mm_setzero_si128());
src1 += stride;
src2 += stride2;
@@ -351,18 +324,13 @@ static void apply_temporal_filter(
const double inv_num_ref_pixels, const double decay_factor,
const double inv_factor, const double weight_factor, double *d_factor,
int tf_wgt_calc_lvl) {
- assert(((block_width == 16) || (block_width == 32)) &&
- ((block_height == 16) || (block_height == 32)));
+ assert(((block_width == 64) || (block_width == 32)) &&
+ ((block_height == 64) || (block_height == 32)));
uint32_t acc_5x5_sse[BH][BW];
- if (block_width == 32) {
- get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
- block_height, frame_sse, SSE_STRIDE);
- } else {
- get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
- block_height, frame_sse, SSE_STRIDE);
- }
+ get_squared_error_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
__m256i vsrc[5];
@@ -409,21 +377,28 @@ static void apply_temporal_filter(
}
}
- double subblock_mses_scaled[4];
- double d_factor_decayed[4];
- for (int idx = 0; idx < 4; idx++) {
+ double subblock_mses_scaled[NUM_16X16];
+ double d_factor_decayed[NUM_16X16];
+ for (int idx = 0; idx < NUM_16X16; idx++) {
subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
d_factor_decayed[idx] = d_factor[idx] * decay_factor;
}
if (tf_wgt_calc_lvl == 0) {
for (int i = 0, k = 0; i < block_height; i++) {
- const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+ const int y16_blk_raster_offset =
+ ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
for (int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame2[i * stride2 + j];
uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
const double window_error = diff_sse * inv_num_ref_pixels;
- const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const int x32_blk_raster_offset = (j >= (block_width >> 1));
+ const int x16_blk_raster_offset =
+ ((j % (block_width >> 1)) >= (block_width >> 2));
+ const int subblock_idx =
+ ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+ (y16_blk_raster_offset + x16_blk_raster_offset);
const double combined_error =
weight_factor * window_error + subblock_mses_scaled[subblock_idx];
@@ -436,8 +411,8 @@ static void apply_temporal_filter(
}
}
} else {
- __m256d subblock_mses_reg[4];
- __m256d d_factor_mul_n_decay_qr_invs[4];
+ __m256d subblock_mses_reg[NUM_16X16];
+ __m256d d_factor_mul_n_decay_qr_invs[NUM_16X16];
const __m256 zero = _mm256_set1_ps(0.0f);
const __m256 point_five = _mm256_set1_ps(0.5f);
const __m256 seven = _mm256_set1_ps(7.0f);
@@ -445,17 +420,15 @@ static void apply_temporal_filter(
const __m256d weight_factor_256bit = _mm256_set1_pd(weight_factor);
const __m256 tf_weight_scale = _mm256_set1_ps((float)TF_WEIGHT_SCALE);
// Maintain registers to hold mse and d_factor at subblock level.
- subblock_mses_reg[0] = _mm256_set1_pd(subblock_mses_scaled[0]);
- subblock_mses_reg[1] = _mm256_set1_pd(subblock_mses_scaled[1]);
- subblock_mses_reg[2] = _mm256_set1_pd(subblock_mses_scaled[2]);
- subblock_mses_reg[3] = _mm256_set1_pd(subblock_mses_scaled[3]);
- d_factor_mul_n_decay_qr_invs[0] = _mm256_set1_pd(d_factor_decayed[0]);
- d_factor_mul_n_decay_qr_invs[1] = _mm256_set1_pd(d_factor_decayed[1]);
- d_factor_mul_n_decay_qr_invs[2] = _mm256_set1_pd(d_factor_decayed[2]);
- d_factor_mul_n_decay_qr_invs[3] = _mm256_set1_pd(d_factor_decayed[3]);
+ for (int i = 0; i < NUM_16X16; i++) {
+ subblock_mses_reg[i] = _mm256_set1_pd(subblock_mses_scaled[i]);
+ d_factor_mul_n_decay_qr_invs[i] = _mm256_set1_pd(d_factor_decayed[i]);
+ }
for (int i = 0; i < block_height; i++) {
- const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+ const int y16_blk_raster_offset =
+ ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
uint32_t *luma_sse_sum_temp = luma_sse_sum + i * BW;
for (int j = 0; j < block_width; j += 8) {
const __m256i acc_sse =
@@ -477,9 +450,13 @@ static void apply_temporal_filter(
const __m256d window_error_2 =
_mm256_mul_pd(diff_sse_pd_2, inv_num_ref_pixel_256bit);
- // const int subblock_idx = y_blk_raster_offset + (j >= block_width /
- // 2);
- const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ // const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
+ const int x32_blk_raster_offset = (j >= (block_width >> 1));
+ const int x16_blk_raster_offset =
+ ((j % (block_width >> 1)) >= (block_width >> 2));
+ const int subblock_idx =
+ ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+ (y16_blk_raster_offset + x16_blk_raster_offset);
const __m256d blk_error = subblock_mses_reg[subblock_idx];
// const double combined_error =
@@ -555,7 +532,7 @@ void av1_apply_temporal_filter_avx2(
int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
uint16_t *count) {
const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
- assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
+ assert(block_size == BLOCK_64X64 && "Only support 64x64 block with avx2!");
assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
assert(!is_high_bitdepth && "Only support low bit-depth with avx2!");
assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
@@ -584,16 +561,20 @@ void av1_apply_temporal_filter_avx2(
// Smaller strength -> smaller filtering weight.
double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
s_decay = CLIP(s_decay, 1e-5, 1);
- double d_factor[4] = { 0 };
- uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
- uint32_t luma_sse_sum[BW * BH] = { 0 };
-
- for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ double d_factor[NUM_16X16] = { 0 };
+ uint16_t *frame_sse =
+ (uint16_t *)aom_memalign(32, sizeof(frame_sse[0]) * SSE_STRIDE * BH);
+ uint32_t *luma_sse_sum =
+ (uint32_t *)aom_memalign(32, sizeof(luma_sse_sum[0]) * BW * BH);
+ memset(frame_sse, 0, sizeof(frame_sse[0]) * SSE_STRIDE * BH);
+ memset(luma_sse_sum, 0, sizeof(luma_sse_sum[0]) * BW * BH);
+
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
// Larger motion vector -> smaller filtering weight.
const MV mv = subblock_mvs[subblock_idx];
const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
- double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
- distance_threshold = AOMMAX(distance_threshold, 1);
d_factor[subblock_idx] = distance / distance_threshold;
d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
}
@@ -624,13 +605,35 @@ void av1_apply_temporal_filter_avx2(
// will be more accurate. The luma sse sum is reused in both chroma
// planes.
if (plane == AOM_PLANE_U) {
- for (unsigned int i = 0, k = 0; i < plane_h; i++) {
- for (unsigned int j = 0; j < plane_w; j++, k++) {
- for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
- for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
- const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
- const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
- luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx];
+ if (ss_x_shift == 1 && ss_y_shift == 1) {
+ const __m256i zero_reg = _mm256_setzero_si256();
+ for (unsigned int i = 0; i < plane_h; i++) {
+ const uint16_t *src_0 = &frame_sse[2 * i * SSE_STRIDE];
+ const uint16_t *src_1 = &frame_sse[(2 * i + 1) * SSE_STRIDE];
+ for (unsigned int j = 0; j < plane_w; j += 8) {
+ const __m256i reg0 = _mm256_loadu_si256((__m256i *)(src_0 + j * 2));
+ const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src_1 + j * 2));
+
+ const __m256i reg0_lo = _mm256_unpacklo_epi16(reg0, zero_reg);
+ const __m256i reg0_hi = _mm256_unpackhi_epi16(reg0, zero_reg);
+ const __m256i reg1_lo = _mm256_unpacklo_epi16(reg1, zero_reg);
+ const __m256i reg1_hi = _mm256_unpackhi_epi16(reg1, zero_reg);
+
+ const __m256i reg_0 = _mm256_add_epi32(reg0_lo, reg1_lo);
+ const __m256i reg_1 = _mm256_add_epi32(reg0_hi, reg1_hi);
+ const __m256i res = _mm256_hadd_epi32(reg_0, reg_1);
+ _mm256_storeu_si256((__m256i *)&luma_sse_sum[i * BW + j], res);
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++, k++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx];
+ }
}
}
}
@@ -644,4 +647,6 @@ void av1_apply_temporal_filter_avx2(
weight_factor, d_factor, tf_wgt_calc_lvl);
plane_offset += plane_h * plane_w;
}
+ aom_free(frame_sse);
+ aom_free(luma_sse_sum);
}
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 8a8c94719e..7fcea7e450 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -110,8 +110,8 @@ static void apply_temporal_filter(
const double inv_num_ref_pixels, const double decay_factor,
const double inv_factor, const double weight_factor, double *d_factor,
int tf_wgt_calc_lvl) {
- assert(((block_width == 16) || (block_width == 32)) &&
- ((block_height == 16) || (block_height == 32)));
+ assert(((block_width == 64) || (block_width == 32)) &&
+ ((block_height == 64) || (block_height == 32)));
uint32_t acc_5x5_sse[BH][BW];
@@ -170,21 +170,28 @@ static void apply_temporal_filter(
}
}
- double subblock_mses_scaled[4];
- double d_factor_decayed[4];
- for (int idx = 0; idx < 4; idx++) {
+ double subblock_mses_scaled[NUM_16X16];
+ double d_factor_decayed[NUM_16X16];
+ for (int idx = 0; idx < NUM_16X16; idx++) {
subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
d_factor_decayed[idx] = d_factor[idx] * decay_factor;
}
if (tf_wgt_calc_lvl == 0) {
for (int i = 0, k = 0; i < block_height; i++) {
- const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+ const int y16_blk_raster_offset =
+ ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
for (int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame2[i * stride2 + j];
uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
const double window_error = diff_sse * inv_num_ref_pixels;
- const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const int x32_blk_raster_offset = (j >= (block_width >> 1));
+ const int x16_blk_raster_offset =
+ ((j % (block_width >> 1)) >= (block_width >> 2));
+ const int subblock_idx =
+ ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+ (y16_blk_raster_offset + x16_blk_raster_offset);
const double combined_error =
weight_factor * window_error + subblock_mses_scaled[subblock_idx];
@@ -198,13 +205,20 @@ static void apply_temporal_filter(
}
} else {
for (int i = 0, k = 0; i < block_height; i++) {
- const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+ const int y16_blk_raster_offset =
+ ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
for (int j = 0; j < block_width; j++, k++) {
const int pixel_value = frame2[i * stride2 + j];
uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
const double window_error = diff_sse * inv_num_ref_pixels;
- const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const int x32_blk_raster_offset = (j >= (block_width >> 1));
+ const int x16_blk_raster_offset =
+ ((j % (block_width >> 1)) >= (block_width >> 2));
+ const int subblock_idx =
+ ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+ (y16_blk_raster_offset + x16_blk_raster_offset);
const double combined_error =
weight_factor * window_error + subblock_mses_scaled[subblock_idx];
@@ -228,7 +242,7 @@ void av1_apply_temporal_filter_sse2(
int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
uint16_t *count) {
const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
- assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+ assert(block_size == BLOCK_64X64 && "Only support 64x64 block with sse2!");
assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
assert(!is_high_bitdepth && "Only support low bit-depth with sse2!");
assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
@@ -257,16 +271,20 @@ void av1_apply_temporal_filter_sse2(
// Smaller strength -> smaller filtering weight.
double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
s_decay = CLIP(s_decay, 1e-5, 1);
- double d_factor[4] = { 0 };
- uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
- uint32_t luma_sse_sum[BW * BH] = { 0 };
-
- for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ double d_factor[NUM_16X16] = { 0 };
+ uint16_t *frame_sse =
+ (uint16_t *)aom_memalign(32, sizeof(frame_sse[0]) * SSE_STRIDE * BH);
+ uint32_t *luma_sse_sum =
+ (uint32_t *)aom_memalign(32, sizeof(luma_sse_sum[0]) * BW * BH);
+ memset(frame_sse, 0, sizeof(frame_sse[0]) * SSE_STRIDE * BH);
+ memset(luma_sse_sum, 0, sizeof(luma_sse_sum[0]) * BW * BH);
+
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
// Larger motion vector -> smaller filtering weight.
const MV mv = subblock_mvs[subblock_idx];
const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
- double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
- distance_threshold = AOMMAX(distance_threshold, 1);
d_factor[subblock_idx] = distance / distance_threshold;
d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
}
@@ -317,4 +335,6 @@ void av1_apply_temporal_filter_sse2(
weight_factor, d_factor, tf_wgt_calc_lvl);
plane_offset += plane_h * plane_w;
}
+ aom_free(frame_sse);
+ aom_free(luma_sse_sum);
}
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 8d9afaa99c..c9b4224adb 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -134,6 +134,7 @@ void TemporalFilterTest::RunTest(int isRandom, int run_times,
static_assert(block_size == BLOCK_64X64, "");
const int width = 64;
const int height = 64;
+ const int pels = width * height;
int num_planes = MAX_MB_PLANE;
int subsampling_x = 0;
int subsampling_y = 0;
@@ -165,18 +166,23 @@ void TemporalFilterTest::RunTest(int isRandom, int run_times,
}
double sigma[MAX_MB_PLANE] = { 2.1002103677063437, 2.1002103677063437,
2.1002103677063437 };
- DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
- DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
- memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
- memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
- DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
- DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
- memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
- memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+ DECLARE_ALIGNED(16, unsigned int, accumulator_ref[pels * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count_ref[pels * 3]);
+ memset(accumulator_ref, 0, pels * 3 * sizeof(accumulator_ref[0]));
+ memset(count_ref, 0, pels * 3 * sizeof(count_ref[0]));
+ DECLARE_ALIGNED(16, unsigned int, accumulator_mod[pels * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count_mod[pels * 3]);
+ memset(accumulator_mod, 0, pels * 3 * sizeof(accumulator_mod[0]));
+ memset(count_mod, 0, pels * 3 * sizeof(count_mod[0]));
static_assert(width == 64 && height == 64, "");
- const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
- const int subblock_mses[4] = { 15, 16, 17, 18 };
+ const MV subblock_mvs[NUM_16X16] = {
+ { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 }, { 0, 0 }, { 5, 5 },
+ { 7, 8 }, { 2, 10 }, { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 },
+ { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 }
+ };
+ const int subblock_mses[NUM_16X16] = { 15, 16, 17, 18, 15, 16, 17, 18,
+ 15, 16, 17, 18, 15, 16, 17, 18 };
const int q_factor = 12;
const int filter_strength = 5;
const int mb_row = 0;
@@ -190,10 +196,10 @@ void TemporalFilterTest::RunTest(int isRandom, int run_times,
frame_to_filter->heights[PLANE_TYPE_UV] = height >> subsampling_y;
frame_to_filter->strides[PLANE_TYPE_Y] = stride;
frame_to_filter->strides[PLANE_TYPE_UV] = stride >> subsampling_x;
- DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]);
+ DECLARE_ALIGNED(16, uint8_t, src[pels * 3]);
frame_to_filter->buffer_alloc = src;
frame_to_filter->flags = 0; // Only support low bit-depth test.
- memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t));
+ memcpy(src, src1_, pels * 3 * sizeof(uint8_t));
std::unique_ptr<MACROBLOCKD> mbd(new (std::nothrow) MACROBLOCKD);
ASSERT_NE(mbd, nullptr);
@@ -284,25 +290,21 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) {
RunTest(1, 100000, I444);
}
-// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
-// needs to be updated.
-// #if HAVE_AVX2
-// TemporalFilterFuncParam temporal_filter_test_avx2[] = {
-// TemporalFilterFuncParam(
-// &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
-// INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
-// Combine(ValuesIn(temporal_filter_test_avx2),
-// Values(0, 1)));
-// #endif // HAVE_AVX2
-//
-// #if HAVE_SSE2
-// TemporalFilterFuncParam temporal_filter_test_sse2[] = {
-// TemporalFilterFuncParam(
-// &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
-// INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
-// Combine(ValuesIn(temporal_filter_test_sse2),
-// Values(0, 1)));
-// #endif // HAVE_SSE2
+#if HAVE_AVX2
+TemporalFilterFuncParam temporal_filter_test_avx2[] = { TemporalFilterFuncParam(
+ &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
+INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
+ Combine(ValuesIn(temporal_filter_test_avx2),
+ Values(0, 1)));
+#endif // HAVE_AVX2
+
+#if HAVE_SSE2
+TemporalFilterFuncParam temporal_filter_test_sse2[] = { TemporalFilterFuncParam(
+ &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
+INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
+ Combine(ValuesIn(temporal_filter_test_sse2),
+ Values(0, 1)));
+#endif // HAVE_SSE2
// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
// needs to be updated.