Commit 6a64bb332e for aom
commit 6a64bb332e059756bf5c9f1f164bfe74cfe2c0db
Author: Yunqing Wang <yunqingwang@google.com>
Date: Thu Feb 26 14:16:42 2026 -0800
Temporal filter improvement
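Add hierarchical temporal filter (TF) block sizes (64x64 -> 32x32 -> 16x16)
for better temporal filter performance.
On the HD test set, this gives a coding gain of -0.4% SSIM, -0.07% PSNR,
and -0.21% VMAF/VMAF_neg.
SIMD versions of the 64x64 block TF function will be added later; until
then, the SIMD unit tests are disabled.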
STATS_CHANGED
Change-Id: Id7628b6105f19fff0ac16491ad36b9ea5009df01
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 4437110e3a..58dbf70090 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -48,8 +48,90 @@
// Forward Declaration.
static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+ const MV *midblock_mvs,
+ const int *midblock_mses,
MV *subblock_mvs, int *subblock_mses);
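+// Runs a full-pixel search followed by a sub-pixel refinement for one
+// sub-block of the TF block. `subblock_size` can be 32x32 or 16x16. The
+// resulting motion vector and MSE are written to `subblock_mvs[idx]` and
+// `subblock_mses[idx]`.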
+static void subblock_motion_search(
+ AV1_COMP *cpi, MACROBLOCK *mb, const YV12_BUFFER_CONFIG *frame_to_filter,
+ const YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE tf_block_size, int mb_row,
+ int mb_col, BLOCK_SIZE subblock_size, int idx, int subblock_ofst_i,
+ int subblock_ofst_j, FULLPEL_MOTION_SEARCH_PARAMS *full_ms_params,
+ SUBPEL_MOTION_SEARCH_PARAMS *ms_params, const MV *ref_mv,
+ FULLPEL_MV start_mv, const search_site_config *search_site_cfg,
+ SEARCH_METHODS search_method, MV_COST_TYPE mv_cost_type, int step_param,
+ SUBPEL_SEARCH_TYPE subpel_ms_type, MV *subblock_mvs, int *subblock_mses,
+ FULLPEL_MV_STATS *best_mv_stats, int q) {
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+
+ const int mb_height = block_size_high[tf_block_size];
+ const int mb_width = block_size_wide[tf_block_size];
+ const int mi_h = mi_size_high_log2[tf_block_size];
+ const int mi_w = mi_size_wide_log2[tf_block_size];
+
+ const int y_stride = frame_to_filter->y_stride;
+ assert(y_stride == ref_frame->y_stride);
+ const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+ const int subblock_height = block_size_high[subblock_size];
+ const int subblock_width = block_size_wide[subblock_size];
+ const int subblock_pels = subblock_height * subblock_width;
+
+ int_mv best_mv; // Searched motion vector.
+ unsigned int sse, error;
+ int distortion;
+ int cost_list[5];
+
+ av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_row << mi_h) + (subblock_ofst_i >> MI_SIZE_LOG2),
+ (subblock_height >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_col << mi_w) + (subblock_ofst_j >> MI_SIZE_LOG2),
+ (subblock_width >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ const int boffset = subblock_ofst_i * y_stride + subblock_ofst_j;
+ mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + boffset;
+ mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + boffset;
+ av1_make_default_fullpel_ms_params(full_ms_params, cpi, mb, subblock_size,
+ ref_mv, start_mv, search_site_cfg,
+ search_method,
+ /*fine_search_interval=*/0);
+ full_ms_params->run_mesh_search = 1;
+ full_ms_params->mv_cost_params.mv_cost_type = mv_cost_type;
+
+ // May need to re-tune prune_mesh_search after increasing the TF block
+ // size to 64x64.
+ if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+ // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+ full_ms_params->prune_mesh_search = (q <= 20) ? 0 : 1;
+ full_ms_params->mesh_search_mv_diff_threshold = 2;
+ }
+
+ av1_full_pixel_search(start_mv, full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
+ best_mv_stats, NULL);
+
+ av1_make_default_subpel_ms_params(ms_params, cpi, mb, subblock_size, ref_mv,
+ cost_list);
+ ms_params->forced_stop = EIGHTH_PEL;
+ ms_params->var_params.subpel_search_type = subpel_ms_type;
+ // Since we are merely refining the result from full pixel
+ // search, we don't need regularization for subpel search
+ ms_params->mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats->err_cost = 0;
+
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params->mv_limits, subpel_start_mv));
+ error = cpi->mv_search_params.find_fractional_mv_step(
+ &mb->e_mbd, &cpi->common, ms_params, subpel_start_mv, best_mv_stats,
+ &best_mv.as_mv, &distortion, &sse, NULL);
+
+ subblock_mses[idx] = DIVIDE_AND_ROUND(error, subblock_pels);
+ subblock_mvs[idx] = best_mv.as_mv;
+}
+
// This function returns the minimum and maximum log variances for 4x4 sub
// blocks in the current block.
static inline void get_log_var_4x4sub_blk(
@@ -236,6 +318,10 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
FULLPEL_MV_STATS best_mv_stats;
int block_mse = INT_MAX;
MV block_mv = kZeroMv;
+ // 32x32 block motion search results.
+ int midblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+ MV midblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+
const int q = get_q(cpi);
av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
@@ -289,54 +375,49 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
*is_dc_diff_large = 50 * error < sse;
if (src_var <= 2 * (int64_t)distortion) *is_low_cntras = 1;
+ // Motion search on each of the 4 mid-blocks in the 64x64 TF block.
if (allow_me_for_sub_blks) {
- // On 4 sub-blocks.
- const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
- const int subblock_height = block_size_high[subblock_size];
- const int subblock_width = block_size_wide[subblock_size];
- const int subblock_pels = subblock_height * subblock_width;
- start_mv = get_fullmv_from_mv(ref_mv);
-
- int subblock_idx = 0;
- for (int i = 0; i < mb_height; i += subblock_height) {
- for (int j = 0; j < mb_width; j += subblock_width) {
- const int offset = i * y_stride + j;
- mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
- mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
- av1_make_default_fullpel_ms_params(
- &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv,
- search_site_cfg, search_method,
- /*fine_search_interval=*/0);
- full_ms_params.run_mesh_search = 1;
- full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
-
- if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
- // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
- full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
- full_ms_params.mesh_search_mv_diff_threshold = 2;
+ // midblock_size is 32x32, which corresponds to each 32x32 block in the
+ // 64x64 block.
+ const BLOCK_SIZE midblock_size = av1_ss_size_lookup[block_size][1][1];
+ const int midblock_height = block_size_high[midblock_size];
+ const int midblock_width = block_size_wide[midblock_size];
+ int midblock_idx = 0;
+ for (int i = 0; i < mb_height; i += midblock_height) {
+ for (int j = 0; j < mb_width; j += midblock_width) {
+ start_mv = get_fullmv_from_mv(ref_mv);
+
+ subblock_motion_search(
+ cpi, mb, frame_to_filter, ref_frame, block_size, mb_row, mb_col,
+ midblock_size, midblock_idx, i, j, &full_ms_params, &ms_params,
+ &baseline_mv, start_mv, search_site_cfg, search_method,
+ mv_cost_type, step_param, subpel_search_type, midblock_mvs,
+ midblock_mses, &best_mv_stats, q);
+
+ // Motion search on each of the 4 sub-blocks in this mid-block.
+ {
+ const BLOCK_SIZE subblock_size =
+ av1_ss_size_lookup[midblock_size][1][1];
+ const int subblock_height = block_size_high[subblock_size];
+ const int subblock_width = block_size_wide[subblock_size];
+
+ start_mv = get_fullmv_from_mv(&midblock_mvs[midblock_idx]);
+
+ int bidx = midblock_idx * 4;
+ for (int bi = 0; bi < midblock_height; bi += subblock_height) {
+ for (int bj = 0; bj < midblock_width; bj += subblock_width) {
+ subblock_motion_search(
+ cpi, mb, frame_to_filter, ref_frame, block_size, mb_row,
+ mb_col, subblock_size, bidx, (i + bi), (j + bj),
+ &full_ms_params, &ms_params, &baseline_mv, start_mv,
+ search_site_cfg, search_method, mv_cost_type, step_param,
+ subpel_search_type, subblock_mvs, subblock_mses,
+ &best_mv_stats, q);
+ ++bidx;
+ }
+ }
}
- av1_full_pixel_search(start_mv, &full_ms_params, step_param,
- cond_cost_list(cpi, cost_list),
- &best_mv.as_fullmv, &best_mv_stats, NULL);
-
- av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
- &baseline_mv, cost_list);
- ms_params.forced_stop = EIGHTH_PEL;
- ms_params.var_params.subpel_search_type = subpel_search_type;
- // Since we are merely refining the result from full pixel search, we
- // don't need regularization for subpel search
- ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
- best_mv_stats.err_cost = 0;
-
- subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
- assert(
- av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
- error = cpi->mv_search_params.find_fractional_mv_step(
- &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
- &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL);
- subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
- subblock_mvs[subblock_idx] = best_mv.as_mv;
- ++subblock_idx;
+ ++midblock_idx;
}
}
}
@@ -348,11 +429,11 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
// Make partition decision.
if (allow_me_for_sub_blks) {
- tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
- subblock_mses);
+ tf_determine_block_partition(block_mv, block_mse, midblock_mvs,
+ midblock_mses, subblock_mvs, subblock_mses);
} else {
- // Copy 32X32 block mv and mse values to sub blocks
- for (int i = 0; i < 4; ++i) {
+ // Copy 64X64 block mv and mse values to sub blocks
+ for (int i = 0; i < 16; ++i) {
subblock_mvs[i] = block_mv;
subblock_mses[i] = block_mse;
}
@@ -372,6 +453,8 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
// block_mv: Motion vector for the entire block (ONLY as reference).
// block_mse: Motion search error (MSE) for the entire block (ONLY as
// reference).
+// midblock_mvs: Pointer to the motion vectors for 4 mid-blocks.
+// midblock_mses: Pointer to the search errors (MSE) for 4 mid-blocks.
// subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be
// modified based on the partition decision).
// subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will
@@ -380,23 +463,53 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
// Nothing will be returned. Results are saved in `subblock_mvs` and
// `subblock_mses`.
static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+ const MV *midblock_mvs,
+ const int *midblock_mses,
MV *subblock_mvs, int *subblock_mses) {
int min_subblock_mse = INT_MAX;
int max_subblock_mse = INT_MIN;
int64_t sum_subblock_mse = 0;
- for (int i = 0; i < 4; ++i) {
+ int i;
+
+ // Go through 4 32x32 blocks.
+ for (int idx = 0; idx < 4; ++idx) {
+ min_subblock_mse = INT_MAX;
+ max_subblock_mse = INT_MIN;
+ sum_subblock_mse = 0;
+
+ const int sub_idx = idx * 4;
+ for (i = sub_idx; i < sub_idx + 4; ++i) {
+ sum_subblock_mse += subblock_mses[i];
+ min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
+ max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
+ }
+
+ if (((midblock_mses[idx] * 15 <= sum_subblock_mse * 4) &&
+ max_subblock_mse - min_subblock_mse < 48) ||
+ ((midblock_mses[idx] * 14 <= sum_subblock_mse * 4) &&
+ max_subblock_mse - min_subblock_mse < 24)) { // No split.
+ for (i = sub_idx; i < sub_idx + 4; ++i) {
+ subblock_mvs[i] = midblock_mvs[idx];
+ subblock_mses[i] = midblock_mses[idx];
+ }
+ }
+ }
+
+ min_subblock_mse = INT_MAX;
+ max_subblock_mse = INT_MIN;
+ sum_subblock_mse = 0;
+ for (i = 0; i < 16; ++i) {
sum_subblock_mse += subblock_mses[i];
min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
}
- // TODO(any): The following magic numbers may be tuned to improve the
- // performance OR find a way to get rid of these magic numbers.
- if (((block_mse * 15 < sum_subblock_mse * 4) &&
- max_subblock_mse - min_subblock_mse < 48) ||
- ((block_mse * 14 < sum_subblock_mse * 4) &&
- max_subblock_mse - min_subblock_mse < 24)) { // No split.
- for (int i = 0; i < 4; ++i) {
+ if (((block_mse * 15 <= sum_subblock_mse) &&
+ (max_subblock_mse - min_subblock_mse) * 16 < sum_subblock_mse * 3) ||
+ ((block_mse * 14 <= sum_subblock_mse) &&
+ (max_subblock_mse - min_subblock_mse) * 8 <
+ sum_subblock_mse)) { // No split.
+ for (i = 0; i < 16; ++i) {
subblock_mvs[i] = block_mv;
subblock_mses[i] = block_mse;
}
@@ -466,35 +579,45 @@ static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
const int plane_w = mb_width >> subsampling_x; // Plane width.
const int plane_y = mb_y >> subsampling_y; // Y-coord (Top-left).
const int plane_x = mb_x >> subsampling_x; // X-coord (Top-left).
- const int h = plane_h >> 1; // Sub-block height.
- const int w = plane_w >> 1; // Sub-block width.
- const int is_y_plane = (plane == 0); // Is Y-plane?
+
+ const int h32 = plane_h >> 1; // 32x32 sub-block height.
+ const int w32 = plane_w >> 1; // 32x32 sub-block width.
+ const int h16 = plane_h >> 2; // 16x16 sub-block height.
+ const int w16 = plane_w >> 2; // 16x16 sub-block width.
+
+ const int is_y_plane = (plane == 0); // Is Y-plane?
const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
ref_frame->widths[is_y_plane ? 0 : 1],
ref_frame->heights[is_y_plane ? 0 : 1],
ref_frame->strides[is_y_plane ? 0 : 1] };
- // Handle each subblock.
- int subblock_idx = 0;
- for (int i = 0; i < plane_h; i += h) {
- for (int j = 0; j < plane_w; j += w) {
- // Choose proper motion vector.
- const MV mv = subblock_mvs[subblock_idx++];
- assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
- mv.col >= INT16_MIN && mv.col <= INT16_MAX);
-
- const int y = plane_y + i;
- const int x = plane_x + j;
-
- // Build predictior for each sub-block on current plane.
- InterPredParams inter_pred_params;
- av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
- subsampling_y, bit_depth, is_high_bitdepth,
- is_intrabc, scale, &ref_buf, interp_filters);
- inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
- av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
- plane_w, &mv, &inter_pred_params);
+ const int sub_y[4] = { 0, 0, h32, h32 };
+ const int sub_x[4] = { 0, w32, 0, w32 };
+ // Handle each 16x16 subblock.
+ for (int idx = 0; idx < 4; ++idx) {
+ int subblock_idx = idx * 4;
+ for (int i = 0; i < h32; i += h16) {
+ for (int j = 0; j < w32; j += w16) {
+ // Choose proper motion vector.
+ const MV mv = subblock_mvs[subblock_idx++];
+ assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
+ mv.col >= INT16_MIN && mv.col <= INT16_MAX);
+
+ const int y = plane_y + sub_y[idx] + i;
+ const int x = plane_x + sub_x[idx] + j;
+
+ // Build predictor for each sub-block on current plane.
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, w16, h16, y, x,
+ subsampling_x, subsampling_y, bit_depth,
+ is_high_bitdepth, is_intrabc, scale, &ref_buf,
+ interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ av1_enc_build_one_inter_predictor(
+ &pred[plane_offset + (sub_y[idx] + i) * plane_w + sub_x[idx] + j],
+ plane_w, &mv, &inter_pred_params);
+ }
}
}
plane_offset += plane_h * plane_w;
@@ -711,16 +834,6 @@ void av1_apply_temporal_filter_c(
const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
}
- double d_factor[4] = { 0 };
- for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
- // Larger motion vector -> smaller filtering weight.
- const MV mv = subblock_mvs[subblock_idx];
- const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
- double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
- distance_threshold = AOMMAX(distance_threshold, 1);
- d_factor[subblock_idx] = distance / distance_threshold;
- d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
- }
// Allocate memory for pixel-wise squared differences. They,
// regardless of the subsampling, are assigned with memory of size `mb_pels`.
@@ -796,14 +909,26 @@ void av1_apply_temporal_filter_c(
// Combine window error and block error, and normalize it.
const double window_error = sum_square_diff * inv_num_ref_pixels;
- const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+
+ // 16x16 block index
+ const int y32 = i / (h / 2);
+ const int x32 = j / (w / 2);
+ const int y16 = (i % (h / 2)) / (h / 4);
+ const int x16 = (j % (w / 2)) / (w / 4);
+ const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
const double block_error = (double)subblock_mses[subblock_idx];
const double combined_error =
weight_factor * window_error + block_error * inv_factor;
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ const double distance_threshold =
+ (double)AOMMAX(min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD, 1);
+ const double d_factor = AOMMAX(distance / distance_threshold, 1);
+
// Compute filter weight.
- double scaled_error =
- combined_error * d_factor[subblock_idx] * decay_factor[plane];
+ double scaled_error = combined_error * d_factor * decay_factor[plane];
scaled_error = AOMMIN(scaled_error, 7);
int weight;
if (tf_wgt_calc_lvl == 0) {
@@ -930,6 +1055,17 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
const GF_GROUP *gf_group = &cpi->ppi->gf_group;
const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ // Determine whether the video is in `YUV 4:2:2` format. The avx2/sse2
+ // functions only support square block sizes, so the C function is used
+ // instead for videos in `YUV 4:2:2` format.
+ int is_yuv422_format = 0;
+ for (int plane = 1; plane < num_planes; ++plane) {
+ if (mbd->plane[plane].subsampling_x != mbd->plane[plane].subsampling_y) {
+ is_yuv422_format = 1;
+ break;
+ }
+ }
+
// Do filtering.
FRAME_DIFF *diff = &td->tf_data.diff;
av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
@@ -967,8 +1103,16 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
if (frames[frame] == NULL) continue;
// Motion search.
- MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
- int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+ // The TF block size is 64x64, which contains 16 16x16 sub-blocks.
+ // Motion search results are stored in 16x16 units.
+ MV subblock_mvs[16] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv,
+ kZeroMv, kZeroMv, kZeroMv, kZeroMv,
+ kZeroMv, kZeroMv, kZeroMv, kZeroMv,
+ kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+ int subblock_mses[16] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX,
+ INT_MAX, INT_MAX, INT_MAX, INT_MAX,
+ INT_MAX, INT_MAX, INT_MAX, INT_MAX,
+ INT_MAX, INT_MAX, INT_MAX, INT_MAX };
int is_dc_diff_large = 0;
int is_low_cntras = 0;
@@ -1006,7 +1150,8 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
// only supports 32x32 block size and 5x5 filtering window.
if (is_frame_high_bitdepth(frame_to_filter)) { // for high bit-depth
#if CONFIG_AV1_HIGHBITDEPTH
- if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+ if (!is_yuv422_format && TF_BLOCK_SIZE == BLOCK_32X32 &&
+ TF_WINDOW_LENGTH == 5) {
av1_highbd_apply_temporal_filter(
frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
noise_levels, subblock_mvs, subblock_mses, q_factor,
@@ -1022,7 +1167,8 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
#endif // CONFIG_AV1_HIGHBITDEPTH
} else {
// for 8-bit
- if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+ if (!is_yuv422_format && TF_BLOCK_SIZE == BLOCK_32X32 &&
+ TF_WINDOW_LENGTH == 5) {
av1_apply_temporal_filter(
frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
noise_levels, subblock_mvs, subblock_mses, q_factor,
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 9585942c32..3f93ab03af 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -30,7 +30,7 @@ struct ThreadData;
#define BW 32
// Block size used in temporal filtering.
-#define TF_BLOCK_SIZE BLOCK_32X32
+#define TF_BLOCK_SIZE BLOCK_64X64
// Window size for temporal filtering.
#define TF_WINDOW_LENGTH 5
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 2e0ae0f3e6..8d9afaa99c 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -131,9 +131,9 @@ void TemporalFilterTest::RunTest(int isRandom, int run_times,
ColorFormat color_fmt) {
aom_usec_timer ref_timer, test_timer;
const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
- static_assert(block_size == BLOCK_32X32, "");
- const int width = 32;
- const int height = 32;
+ static_assert(block_size == BLOCK_64X64, "");
+ const int width = 64;
+ const int height = 64;
int num_planes = MAX_MB_PLANE;
int subsampling_x = 0;
int subsampling_y = 0;
@@ -174,7 +174,7 @@ void TemporalFilterTest::RunTest(int isRandom, int run_times,
memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
- static_assert(width == 32 && height == 32, "");
+ static_assert(width == 64 && height == 64, "");
const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
const int subblock_mses[4] = { 15, 16, 17, 18 };
const int q_factor = 12;
@@ -284,39 +284,46 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) {
RunTest(1, 100000, I444);
}
-#if HAVE_AVX2
-TemporalFilterFuncParam temporal_filter_test_avx2[] = { TemporalFilterFuncParam(
- &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
-INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
- Combine(ValuesIn(temporal_filter_test_avx2),
- Values(0, 1)));
-#endif // HAVE_AVX2
-
-#if HAVE_SSE2
-TemporalFilterFuncParam temporal_filter_test_sse2[] = { TemporalFilterFuncParam(
- &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
-INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
- Combine(ValuesIn(temporal_filter_test_sse2),
- Values(0, 1)));
-#endif // HAVE_SSE2
-
-#if HAVE_NEON
-TemporalFilterFuncParam temporal_filter_test_neon[] = { TemporalFilterFuncParam(
- &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) };
-INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
- Combine(ValuesIn(temporal_filter_test_neon),
- Values(0, 1)));
-#endif // HAVE_NEON
-
-#if HAVE_NEON_DOTPROD
-TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
- TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
- &av1_apply_temporal_filter_neon_dotprod)
-};
-INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
- Combine(ValuesIn(temporal_filter_test_neon_dotprod),
- Values(0, 1)));
-#endif // HAVE_NEON_DOTPROD
+// av1_apply_temporal_filter_c now works on 64x64 TF blocks. The SIMD
+// functions need to be updated before these tests can be re-enabled.
+// #if HAVE_AVX2
+// TemporalFilterFuncParam temporal_filter_test_avx2[] = {
+// TemporalFilterFuncParam(
+// &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
+// INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
+// Combine(ValuesIn(temporal_filter_test_avx2),
+// Values(0, 1)));
+// #endif // HAVE_AVX2
+//
+// #if HAVE_SSE2
+// TemporalFilterFuncParam temporal_filter_test_sse2[] = {
+// TemporalFilterFuncParam(
+// &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
+// INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
+// Combine(ValuesIn(temporal_filter_test_sse2),
+// Values(0, 1)));
+// #endif // HAVE_SSE2
+
+// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
+// needs to be updated.
+// #if HAVE_NEON
+// TemporalFilterFuncParam temporal_filter_test_neon[] = {
+// TemporalFilterFuncParam(
+// &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) };
+// INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
+// Combine(ValuesIn(temporal_filter_test_neon),
+// Values(0, 1)));
+// #endif // HAVE_NEON
+
+// #if HAVE_NEON_DOTPROD
+// TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
+// TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
+// &av1_apply_temporal_filter_neon_dotprod)
+// };
+// INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
+// Combine(ValuesIn(temporal_filter_test_neon_dotprod),
+// Values(0, 1)));
+// #endif // HAVE_NEON_DOTPROD
#if HAVE_AVX2 || HAVE_NEON
// Width and height for which av1_estimate_noise_from_single_plane() will be
@@ -514,9 +521,9 @@ void HBDTemporalFilterTest::RunTest(int isRandom, int run_times, int BD,
ColorFormat color_fmt) {
aom_usec_timer ref_timer, test_timer;
const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
- static_assert(block_size == BLOCK_32X32, "");
- const int width = 32;
- const int height = 32;
+ static_assert(block_size == BLOCK_64X64, "");
+ const int width = 64;
+ const int height = 64;
int num_planes = MAX_MB_PLANE;
int subsampling_x = 0;
int subsampling_y = 0;
@@ -557,7 +564,7 @@ void HBDTemporalFilterTest::RunTest(int isRandom, int run_times, int BD,
memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
- static_assert(width == 32 && height == 32, "");
+ static_assert(width == 64 && height == 64, "");
const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
const int subblock_mses[4] = { 15, 16, 17, 18 };
const int q_factor = 12;
@@ -667,34 +674,39 @@ TEST_P(HBDTemporalFilterTest, DISABLED_Speed) {
RunTest(1, 100000, 10, I422);
RunTest(1, 100000, 10, I444);
}
-#if HAVE_SSE2
-HBDTemporalFilterFuncParam HBDtemporal_filter_test_sse2[] = {
- HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
- &av1_highbd_apply_temporal_filter_sse2)
-};
-INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest,
- Combine(ValuesIn(HBDtemporal_filter_test_sse2),
- Values(0, 1)));
-#endif // HAVE_SSE2
-#if HAVE_AVX2
-HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = {
- HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
- &av1_highbd_apply_temporal_filter_avx2)
-};
-INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
- Combine(ValuesIn(HBDtemporal_filter_test_avx2),
- Values(0, 1)));
-#endif // HAVE_AVX2
-#if HAVE_NEON
-HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {
- HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
- &av1_highbd_apply_temporal_filter_neon)
-};
-INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
- Combine(ValuesIn(HBDtemporal_filter_test_neon),
- Values(0, 1)));
-#endif // HAVE_NEON
+// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
+// needs to be updated.
+// #if HAVE_SSE2
+// HBDTemporalFilterFuncParam HBDtemporal_filter_test_sse2[] = {
+// HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+// &av1_highbd_apply_temporal_filter_sse2)
+//};
+// INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest,
+// Combine(ValuesIn(HBDtemporal_filter_test_sse2),
+// Values(0, 1)));
+// #endif // HAVE_SSE2
+// #if HAVE_AVX2
+// HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = {
+// HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+// &av1_highbd_apply_temporal_filter_avx2)
+//};
+// INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
+// Combine(ValuesIn(HBDtemporal_filter_test_avx2),
+// Values(0, 1)));
+// #endif // HAVE_AVX2
+
+// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
+// needs to be updated.
+// #if HAVE_NEON
+// HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {
+// HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+// &av1_highbd_apply_temporal_filter_neon)
+//};
+// INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
+// Combine(ValuesIn(HBDtemporal_filter_test_neon),
+// Values(0, 1)));
+// #endif // HAVE_NEON
using HBDEstimateNoiseFunc = double (*)(const uint16_t *src, int height,
int width, int stride, int bit_depth,