Commit 73741777a5 for aom

commit 73741777a5431c63484c7f7cb5f4eb95fa63f212
Author: Diksha Singh <diksha.singh@ittiam.com>
Date:   Mon Mar 9 22:43:11 2026 +0530

    Enable AVX2 and SSE2 for av1_apply_temporal_filter()

    The AVX2 and SSE2 implementations are modified for TF_BLOCK_SIZE of 64x64.
    Scaling w.r.t. C is as follows:

           tf_wgt_calc_lvl=0    tf_wgt_calc_lvl=1
    AVX2        7.9x                 14.9x
    SSE2        6.1x                  7.8x

    Change-Id: I34cb2b9625f7bc2847c1bd35a562120224a94d8f

diff --git a/av1/encoder/arm/temporal_filter_neon.c b/av1/encoder/arm/temporal_filter_neon.c
index 132045f24c..3f62af6659 100644
--- a/av1/encoder/arm/temporal_filter_neon.c
+++ b/av1/encoder/arm/temporal_filter_neon.c
@@ -192,6 +192,8 @@ static void apply_temporal_filter(
   }
 }

+// TODO: bug aomedia:493082083 - Modify this function to support TF_BLOCK_SIZE
+// of 64x64.
 void av1_apply_temporal_filter_neon(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -199,6 +201,13 @@ void av1_apply_temporal_filter_neon(
     const int *subblock_mses, const int q_factor, const int filter_strength,
     int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
     uint16_t *count) {
+  if (block_size == BLOCK_64X64) {
+    av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row,
+                                mb_col, num_planes, noise_levels, subblock_mvs,
+                                subblock_mses, q_factor, filter_strength,
+                                tf_wgt_calc_lvl, pred, accum, count);
+    return;
+  }
   const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
   assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
   assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
diff --git a/av1/encoder/arm/temporal_filter_neon_dotprod.c b/av1/encoder/arm/temporal_filter_neon_dotprod.c
index c8fd699008..fc6a1252d3 100644
--- a/av1/encoder/arm/temporal_filter_neon_dotprod.c
+++ b/av1/encoder/arm/temporal_filter_neon_dotprod.c
@@ -214,6 +214,8 @@ static void apply_temporal_filter(
   }
 }

+// TODO: bug aomedia:493082083 - Modify this function to support TF_BLOCK_SIZE
+// of 64x64.
 void av1_apply_temporal_filter_neon_dotprod(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
@@ -221,6 +223,13 @@ void av1_apply_temporal_filter_neon_dotprod(
     const int *subblock_mses, const int q_factor, const int filter_strength,
     int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
     uint16_t *count) {
+  if (block_size == BLOCK_64X64) {
+    av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row,
+                                mb_col, num_planes, noise_levels, subblock_mvs,
+                                subblock_mses, q_factor, filter_strength,
+                                tf_wgt_calc_lvl, pred, accum, count);
+    return;
+  }
   const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
   assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
   assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 3840d26de7..bf827f187a 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -46,9 +46,6 @@

 // NOTE: All `tf` in this file means `temporal filtering`.

-// Number of 16x16 blocks within one 64x64 TF block.
-#define NUM_16X16 16
-
 // Forward Declaration.
 static void tf_determine_block_partition(const MV block_mv, const int block_mse,
                                          const MV *midblock_mvs,
@@ -1176,7 +1173,7 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
 #endif  // CONFIG_AV1_HIGHBITDEPTH
         } else {
           // for 8-bit
-          if (!is_yuv422_format && TF_BLOCK_SIZE == BLOCK_32X32 &&
+          if (!is_yuv422_format && TF_BLOCK_SIZE == BLOCK_64X64 &&
               TF_WINDOW_LENGTH == 5) {
             av1_apply_temporal_filter(
                 frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 3f93ab03af..d0108c41b7 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -26,8 +26,8 @@ struct ThreadData;
 // TODO(wtc): These two variables are only used in avx2, sse2, neon
 // implementations, where the block size is still hard coded to TF_BLOCK_SIZE.
 // This should be fixed to align with the c implementation.
-#define BH 32
-#define BW 32
+#define BH 64
+#define BW 64

 // Block size used in temporal filtering.
 #define TF_BLOCK_SIZE BLOCK_64X64
@@ -35,6 +35,9 @@ struct ThreadData;
 // Window size for temporal filtering.
 #define TF_WINDOW_LENGTH 5

+// Number of 16x16 blocks within one 64x64 TF block.
+#define NUM_16X16 16
+
 // A constant number, sqrt(pi / 2),  used for noise estimation.
 static const double SQRT_PI_BY_2 = 1.25331413732;

diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index f3d85e123b..45d563be18 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -229,62 +229,35 @@ double av1_estimate_noise_from_single_plane_avx2(const uint8_t *src, int height,
   return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
 }

-static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+static AOM_FORCE_INLINE void get_squared_error_avx2(
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
     uint16_t *frame_sse, const unsigned int sse_stride) {
-  (void)block_width;
   const uint8_t *src1 = frame1;
   const uint8_t *src2 = frame2;
   uint16_t *dst = frame_sse;
   for (int i = 0; i < block_height; i++) {
-    __m128i vf1_128, vf2_128;
-    __m256i vf1, vf2, vdiff1, vsqdiff1;
-
-    vf1_128 = _mm_loadu_si128((__m128i *)(src1));
-    vf2_128 = _mm_loadu_si128((__m128i *)(src2));
-    vf1 = _mm256_cvtepu8_epi16(vf1_128);
-    vf2 = _mm256_cvtepu8_epi16(vf2_128);
-    vdiff1 = _mm256_sub_epi16(vf1, vf2);
-    vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1);
-
-    _mm256_storeu_si256((__m256i *)(dst), vsqdiff1);
-    // Set zero to uninitialized memory to avoid uninitialized loads later
-    *(int *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128());
-
-    src1 += stride, src2 += stride2;
-    dst += sse_stride;
-  }
-}
-
-static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
-    const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
-    const unsigned int stride2, const int block_width, const int block_height,
-    uint16_t *frame_sse, const unsigned int sse_stride) {
-  (void)block_width;
-  const uint8_t *src1 = frame1;
-  const uint8_t *src2 = frame2;
-  uint16_t *dst = frame_sse;
-  for (int i = 0; i < block_height; i++) {
-    __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2;
-
-    vsrc1 = _mm256_loadu_si256((__m256i *)src1);
-    vsrc2 = _mm256_loadu_si256((__m256i *)src2);
-    vmax = _mm256_max_epu8(vsrc1, vsrc2);
-    vmin = _mm256_min_epu8(vsrc1, vsrc2);
-    vdiff = _mm256_subs_epu8(vmax, vmin);
-
-    __m128i vtmp1 = _mm256_castsi256_si128(vdiff);
-    __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1);
-    vdiff1 = _mm256_cvtepu8_epi16(vtmp1);
-    vdiff2 = _mm256_cvtepu8_epi16(vtmp2);
-
-    vres1 = _mm256_mullo_epi16(vdiff1, vdiff1);
-    vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
-    _mm256_storeu_si256((__m256i *)(dst), vres1);
-    _mm256_storeu_si256((__m256i *)(dst + 16), vres2);
+    for (int j = 0; j < block_width; j += 32) {
+      __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2;
+
+      vsrc1 = _mm256_loadu_si256((__m256i *)(src1 + j));
+      vsrc2 = _mm256_loadu_si256((__m256i *)(src2 + j));
+      vmax = _mm256_max_epu8(vsrc1, vsrc2);
+      vmin = _mm256_min_epu8(vsrc1, vsrc2);
+      vdiff = _mm256_subs_epu8(vmax, vmin);
+
+      __m128i vtmp1 = _mm256_castsi256_si128(vdiff);
+      __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1);
+      vdiff1 = _mm256_cvtepu8_epi16(vtmp1);
+      vdiff2 = _mm256_cvtepu8_epi16(vtmp2);
+
+      vres1 = _mm256_mullo_epi16(vdiff1, vdiff1);
+      vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
+      _mm256_storeu_si256((__m256i *)(dst + j), vres1);
+      _mm256_storeu_si256((__m256i *)(dst + 16 + j), vres2);
+    }
     // Set zero to uninitialized memory to avoid uninitialized loads later
-    *(int *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128());
+    *(int *)(dst + block_width) = _mm_cvtsi128_si32(_mm_setzero_si128());

     src1 += stride;
     src2 += stride2;
@@ -351,18 +324,13 @@ static void apply_temporal_filter(
     const double inv_num_ref_pixels, const double decay_factor,
     const double inv_factor, const double weight_factor, double *d_factor,
     int tf_wgt_calc_lvl) {
-  assert(((block_width == 16) || (block_width == 32)) &&
-         ((block_height == 16) || (block_height == 32)));
+  assert(((block_width == 64) || (block_width == 32)) &&
+         ((block_height == 64) || (block_height == 32)));

   uint32_t acc_5x5_sse[BH][BW];

-  if (block_width == 32) {
-    get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
-                                 block_height, frame_sse, SSE_STRIDE);
-  } else {
-    get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
-                                 block_height, frame_sse, SSE_STRIDE);
-  }
+  get_squared_error_avx2(frame1, stride, frame2, stride2, block_width,
+                         block_height, frame_sse, SSE_STRIDE);

   __m256i vsrc[5];

@@ -409,21 +377,28 @@ static void apply_temporal_filter(
     }
   }

-  double subblock_mses_scaled[4];
-  double d_factor_decayed[4];
-  for (int idx = 0; idx < 4; idx++) {
+  double subblock_mses_scaled[NUM_16X16];
+  double d_factor_decayed[NUM_16X16];
+  for (int idx = 0; idx < NUM_16X16; idx++) {
     subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
     d_factor_decayed[idx] = d_factor[idx] * decay_factor;
   }
   if (tf_wgt_calc_lvl == 0) {
     for (int i = 0, k = 0; i < block_height; i++) {
-      const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+      const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+      const int y16_blk_raster_offset =
+          ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
       for (int j = 0; j < block_width; j++, k++) {
         const int pixel_value = frame2[i * stride2 + j];
         uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];

         const double window_error = diff_sse * inv_num_ref_pixels;
-        const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+        const int x32_blk_raster_offset = (j >= (block_width >> 1));
+        const int x16_blk_raster_offset =
+            ((j % (block_width >> 1)) >= (block_width >> 2));
+        const int subblock_idx =
+            ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+            (y16_blk_raster_offset + x16_blk_raster_offset);
         const double combined_error =
             weight_factor * window_error + subblock_mses_scaled[subblock_idx];

@@ -436,8 +411,8 @@ static void apply_temporal_filter(
       }
     }
   } else {
-    __m256d subblock_mses_reg[4];
-    __m256d d_factor_mul_n_decay_qr_invs[4];
+    __m256d subblock_mses_reg[NUM_16X16];
+    __m256d d_factor_mul_n_decay_qr_invs[NUM_16X16];
     const __m256 zero = _mm256_set1_ps(0.0f);
     const __m256 point_five = _mm256_set1_ps(0.5f);
     const __m256 seven = _mm256_set1_ps(7.0f);
@@ -445,17 +420,15 @@ static void apply_temporal_filter(
     const __m256d weight_factor_256bit = _mm256_set1_pd(weight_factor);
     const __m256 tf_weight_scale = _mm256_set1_ps((float)TF_WEIGHT_SCALE);
     // Maintain registers to hold mse and d_factor at subblock level.
-    subblock_mses_reg[0] = _mm256_set1_pd(subblock_mses_scaled[0]);
-    subblock_mses_reg[1] = _mm256_set1_pd(subblock_mses_scaled[1]);
-    subblock_mses_reg[2] = _mm256_set1_pd(subblock_mses_scaled[2]);
-    subblock_mses_reg[3] = _mm256_set1_pd(subblock_mses_scaled[3]);
-    d_factor_mul_n_decay_qr_invs[0] = _mm256_set1_pd(d_factor_decayed[0]);
-    d_factor_mul_n_decay_qr_invs[1] = _mm256_set1_pd(d_factor_decayed[1]);
-    d_factor_mul_n_decay_qr_invs[2] = _mm256_set1_pd(d_factor_decayed[2]);
-    d_factor_mul_n_decay_qr_invs[3] = _mm256_set1_pd(d_factor_decayed[3]);
+    for (int i = 0; i < NUM_16X16; i++) {
+      subblock_mses_reg[i] = _mm256_set1_pd(subblock_mses_scaled[i]);
+      d_factor_mul_n_decay_qr_invs[i] = _mm256_set1_pd(d_factor_decayed[i]);
+    }

     for (int i = 0; i < block_height; i++) {
-      const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+      const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+      const int y16_blk_raster_offset =
+          ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
       uint32_t *luma_sse_sum_temp = luma_sse_sum + i * BW;
       for (int j = 0; j < block_width; j += 8) {
         const __m256i acc_sse =
@@ -477,9 +450,13 @@ static void apply_temporal_filter(
         const __m256d window_error_2 =
             _mm256_mul_pd(diff_sse_pd_2, inv_num_ref_pixel_256bit);

-        // const int subblock_idx = y_blk_raster_offset + (j >= block_width /
-        // 2);
-        const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+        // const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
+        const int x32_blk_raster_offset = (j >= (block_width >> 1));
+        const int x16_blk_raster_offset =
+            ((j % (block_width >> 1)) >= (block_width >> 2));
+        const int subblock_idx =
+            ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+            (y16_blk_raster_offset + x16_blk_raster_offset);
         const __m256d blk_error = subblock_mses_reg[subblock_idx];

         // const double combined_error =
@@ -555,7 +532,7 @@ void av1_apply_temporal_filter_avx2(
     int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
     uint16_t *count) {
   const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
-  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
+  assert(block_size == BLOCK_64X64 && "Only support 64x64 block with avx2!");
   assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
   assert(!is_high_bitdepth && "Only support low bit-depth with avx2!");
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
@@ -584,16 +561,20 @@ void av1_apply_temporal_filter_avx2(
   // Smaller strength -> smaller filtering weight.
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
-  double d_factor[4] = { 0 };
-  uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
-  uint32_t luma_sse_sum[BW * BH] = { 0 };
-
-  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+  double d_factor[NUM_16X16] = { 0 };
+  uint16_t *frame_sse =
+      (uint16_t *)aom_memalign(32, sizeof(frame_sse[0]) * SSE_STRIDE * BH);
+  uint32_t *luma_sse_sum =
+      (uint32_t *)aom_memalign(32, sizeof(luma_sse_sum[0]) * BW * BH);
+  memset(frame_sse, 0, sizeof(frame_sse[0]) * SSE_STRIDE * BH);
+  memset(luma_sse_sum, 0, sizeof(luma_sse_sum[0]) * BW * BH);
+
+  double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+  distance_threshold = AOMMAX(distance_threshold, 1);
+  for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
     // Larger motion vector -> smaller filtering weight.
     const MV mv = subblock_mvs[subblock_idx];
     const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
-    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
-    distance_threshold = AOMMAX(distance_threshold, 1);
     d_factor[subblock_idx] = distance / distance_threshold;
     d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
   }
@@ -624,13 +605,35 @@ void av1_apply_temporal_filter_avx2(
     // will be more accurate. The luma sse sum is reused in both chroma
     // planes.
     if (plane == AOM_PLANE_U) {
-      for (unsigned int i = 0, k = 0; i < plane_h; i++) {
-        for (unsigned int j = 0; j < plane_w; j++, k++) {
-          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx];
+      if (ss_x_shift == 1 && ss_y_shift == 1) {
+        const __m256i zero_reg = _mm256_setzero_si256();
+        for (unsigned int i = 0; i < plane_h; i++) {
+          const uint16_t *src_0 = &frame_sse[2 * i * SSE_STRIDE];
+          const uint16_t *src_1 = &frame_sse[(2 * i + 1) * SSE_STRIDE];
+          for (unsigned int j = 0; j < plane_w; j += 8) {
+            const __m256i reg0 = _mm256_loadu_si256((__m256i *)(src_0 + j * 2));
+            const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src_1 + j * 2));
+
+            const __m256i reg0_lo = _mm256_unpacklo_epi16(reg0, zero_reg);
+            const __m256i reg0_hi = _mm256_unpackhi_epi16(reg0, zero_reg);
+            const __m256i reg1_lo = _mm256_unpacklo_epi16(reg1, zero_reg);
+            const __m256i reg1_hi = _mm256_unpackhi_epi16(reg1, zero_reg);
+
+            const __m256i reg_0 = _mm256_add_epi32(reg0_lo, reg1_lo);
+            const __m256i reg_1 = _mm256_add_epi32(reg0_hi, reg1_hi);
+            const __m256i res = _mm256_hadd_epi32(reg_0, reg_1);
+            _mm256_storeu_si256((__m256i *)&luma_sse_sum[i * BW + j], res);
+          }
+        }
+      } else {
+        for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+          for (unsigned int j = 0; j < plane_w; j++, k++) {
+            for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+              for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+                const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+                const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+                luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx];
+              }
             }
           }
         }
@@ -644,4 +647,6 @@ void av1_apply_temporal_filter_avx2(
                           weight_factor, d_factor, tf_wgt_calc_lvl);
     plane_offset += plane_h * plane_w;
   }
+  aom_free(frame_sse);
+  aom_free(luma_sse_sum);
 }
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 8a8c94719e..7fcea7e450 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -110,8 +110,8 @@ static void apply_temporal_filter(
     const double inv_num_ref_pixels, const double decay_factor,
     const double inv_factor, const double weight_factor, double *d_factor,
     int tf_wgt_calc_lvl) {
-  assert(((block_width == 16) || (block_width == 32)) &&
-         ((block_height == 16) || (block_height == 32)));
+  assert(((block_width == 64) || (block_width == 32)) &&
+         ((block_height == 64) || (block_height == 32)));

   uint32_t acc_5x5_sse[BH][BW];

@@ -170,21 +170,28 @@ static void apply_temporal_filter(
     }
   }

-  double subblock_mses_scaled[4];
-  double d_factor_decayed[4];
-  for (int idx = 0; idx < 4; idx++) {
+  double subblock_mses_scaled[NUM_16X16];
+  double d_factor_decayed[NUM_16X16];
+  for (int idx = 0; idx < NUM_16X16; idx++) {
     subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
     d_factor_decayed[idx] = d_factor[idx] * decay_factor;
   }
   if (tf_wgt_calc_lvl == 0) {
     for (int i = 0, k = 0; i < block_height; i++) {
-      const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+      const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+      const int y16_blk_raster_offset =
+          ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
       for (int j = 0; j < block_width; j++, k++) {
         const int pixel_value = frame2[i * stride2 + j];
         uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];

         const double window_error = diff_sse * inv_num_ref_pixels;
-        const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+        const int x32_blk_raster_offset = (j >= (block_width >> 1));
+        const int x16_blk_raster_offset =
+            ((j % (block_width >> 1)) >= (block_width >> 2));
+        const int subblock_idx =
+            ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+            (y16_blk_raster_offset + x16_blk_raster_offset);
         const double combined_error =
             weight_factor * window_error + subblock_mses_scaled[subblock_idx];

@@ -198,13 +205,20 @@ static void apply_temporal_filter(
     }
   } else {
     for (int i = 0, k = 0; i < block_height; i++) {
-      const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+      const int y32_blk_raster_offset = (i >= (block_height >> 1)) << 1;
+      const int y16_blk_raster_offset =
+          ((i % (block_height >> 1)) >= (block_height >> 2)) << 1;
       for (int j = 0; j < block_width; j++, k++) {
         const int pixel_value = frame2[i * stride2 + j];
         uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];

         const double window_error = diff_sse * inv_num_ref_pixels;
-        const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+        const int x32_blk_raster_offset = (j >= (block_width >> 1));
+        const int x16_blk_raster_offset =
+            ((j % (block_width >> 1)) >= (block_width >> 2));
+        const int subblock_idx =
+            ((y32_blk_raster_offset + x32_blk_raster_offset) << 2) +
+            (y16_blk_raster_offset + x16_blk_raster_offset);
         const double combined_error =
             weight_factor * window_error + subblock_mses_scaled[subblock_idx];

@@ -228,7 +242,7 @@ void av1_apply_temporal_filter_sse2(
     int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
     uint16_t *count) {
   const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
-  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+  assert(block_size == BLOCK_64X64 && "Only support 64x64 block with sse2!");
   assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
   assert(!is_high_bitdepth && "Only support low bit-depth with sse2!");
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
@@ -257,16 +271,20 @@ void av1_apply_temporal_filter_sse2(
   // Smaller strength -> smaller filtering weight.
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
-  double d_factor[4] = { 0 };
-  uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
-  uint32_t luma_sse_sum[BW * BH] = { 0 };
-
-  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+  double d_factor[NUM_16X16] = { 0 };
+  uint16_t *frame_sse =
+      (uint16_t *)aom_memalign(32, sizeof(frame_sse[0]) * SSE_STRIDE * BH);
+  uint32_t *luma_sse_sum =
+      (uint32_t *)aom_memalign(32, sizeof(luma_sse_sum[0]) * BW * BH);
+  memset(frame_sse, 0, sizeof(frame_sse[0]) * SSE_STRIDE * BH);
+  memset(luma_sse_sum, 0, sizeof(luma_sse_sum[0]) * BW * BH);
+
+  double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+  distance_threshold = AOMMAX(distance_threshold, 1);
+  for (int subblock_idx = 0; subblock_idx < NUM_16X16; subblock_idx++) {
     // Larger motion vector -> smaller filtering weight.
     const MV mv = subblock_mvs[subblock_idx];
     const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
-    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
-    distance_threshold = AOMMAX(distance_threshold, 1);
     d_factor[subblock_idx] = distance / distance_threshold;
     d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
   }
@@ -317,4 +335,6 @@ void av1_apply_temporal_filter_sse2(
                           weight_factor, d_factor, tf_wgt_calc_lvl);
     plane_offset += plane_h * plane_w;
   }
+  aom_free(frame_sse);
+  aom_free(luma_sse_sum);
 }
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 8d9afaa99c..c9b4224adb 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -134,6 +134,7 @@ void TemporalFilterTest::RunTest(int isRandom, int run_times,
   static_assert(block_size == BLOCK_64X64, "");
   const int width = 64;
   const int height = 64;
+  const int pels = width * height;
   int num_planes = MAX_MB_PLANE;
   int subsampling_x = 0;
   int subsampling_y = 0;
@@ -165,18 +166,23 @@ void TemporalFilterTest::RunTest(int isRandom, int run_times,
     }
     double sigma[MAX_MB_PLANE] = { 2.1002103677063437, 2.1002103677063437,
                                    2.1002103677063437 };
-    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
-    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
-    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
-    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
-    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
-    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
-    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
-    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[pels * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_ref[pels * 3]);
+    memset(accumulator_ref, 0, pels * 3 * sizeof(accumulator_ref[0]));
+    memset(count_ref, 0, pels * 3 * sizeof(count_ref[0]));
+    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[pels * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_mod[pels * 3]);
+    memset(accumulator_mod, 0, pels * 3 * sizeof(accumulator_mod[0]));
+    memset(count_mod, 0, pels * 3 * sizeof(count_mod[0]));

     static_assert(width == 64 && height == 64, "");
-    const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
-    const int subblock_mses[4] = { 15, 16, 17, 18 };
+    const MV subblock_mvs[NUM_16X16] = {
+      { 0, 0 }, { 5, 5 },  { 7, 8 }, { 2, 10 }, { 0, 0 }, { 5, 5 },
+      { 7, 8 }, { 2, 10 }, { 0, 0 }, { 5, 5 },  { 7, 8 }, { 2, 10 },
+      { 0, 0 }, { 5, 5 },  { 7, 8 }, { 2, 10 }
+    };
+    const int subblock_mses[NUM_16X16] = { 15, 16, 17, 18, 15, 16, 17, 18,
+                                           15, 16, 17, 18, 15, 16, 17, 18 };
     const int q_factor = 12;
     const int filter_strength = 5;
     const int mb_row = 0;
@@ -190,10 +196,10 @@ void TemporalFilterTest::RunTest(int isRandom, int run_times,
     frame_to_filter->heights[PLANE_TYPE_UV] = height >> subsampling_y;
     frame_to_filter->strides[PLANE_TYPE_Y] = stride;
     frame_to_filter->strides[PLANE_TYPE_UV] = stride >> subsampling_x;
-    DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]);
+    DECLARE_ALIGNED(16, uint8_t, src[pels * 3]);
     frame_to_filter->buffer_alloc = src;
     frame_to_filter->flags = 0;  // Only support low bit-depth test.
-    memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t));
+    memcpy(src, src1_, pels * 3 * sizeof(uint8_t));

     std::unique_ptr<MACROBLOCKD> mbd(new (std::nothrow) MACROBLOCKD);
     ASSERT_NE(mbd, nullptr);
@@ -284,25 +290,21 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) {
   RunTest(1, 100000, I444);
 }

-// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
-// needs to be updated.
-// #if HAVE_AVX2
-// TemporalFilterFuncParam temporal_filter_test_avx2[] = {
-// TemporalFilterFuncParam(
-//    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
-// INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
-//                         Combine(ValuesIn(temporal_filter_test_avx2),
-//                                 Values(0, 1)));
-// #endif  // HAVE_AVX2
-//
-// #if HAVE_SSE2
-// TemporalFilterFuncParam temporal_filter_test_sse2[] = {
-// TemporalFilterFuncParam(
-//    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
-// INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
-//                         Combine(ValuesIn(temporal_filter_test_sse2),
-//                                 Values(0, 1)));
-// #endif  // HAVE_SSE2
+#if HAVE_AVX2
+TemporalFilterFuncParam temporal_filter_test_avx2[] = { TemporalFilterFuncParam(
+    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
+INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_avx2),
+                                 Values(0, 1)));
+#endif  // HAVE_AVX2
+
+#if HAVE_SSE2
+TemporalFilterFuncParam temporal_filter_test_sse2[] = { TemporalFilterFuncParam(
+    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
+INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_sse2),
+                                 Values(0, 1)));
+#endif  // HAVE_SSE2

 // av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
 // needs to be updated.