Commit cb0b87d6c2 for aom

commit cb0b87d6c2dda00b534149b0f27559bf8fa6eecb
Author: Jeremy Dorfman <jdorfman@google.com>
Date:   Wed May 13 08:28:54 2026 -0400

    [variance] Optimize AVX2 subpel bilinear filtering

    This change replaces row-by-row execution with explicit pipelining in
    the X and Y bilinear filtering path, improving performance by around 20%
    in aom_sub_pixel_varianceWxH_avx2. All of the improvement comes from
    better instruction-level parallelism, achieved by interleaving loads
    with filtering.

    This change doesn't use or change the existing macros; I will openly
    admit it was hard for me to keep track of all of them, so the new code
    avoids them entirely.

    Width/Height
               Baseline CPU  New CPU      Improvement
    128/128    1.386µ ± 0%   1.085µ ± 0%  -21.73% (p=0.000 n=20)
    128/64     675.6n ± 0%   525.9n ± 0%  -22.17% (p=0.000 n=20)
    64/128     677.1n ± 0%   525.7n ± 0%  -22.36% (p=0.000 n=20)
    64/64      337.5n ± 0%   261.5n ± 1%  -22.54% (p=0.000 n=20)
    64/32      174.9n ± 0%   136.4n ± 0%  -22.02% (p=0.000 n=20)
    32/64      167.4n ± 1%   130.7n ± 0%  -21.89% (p=0.000 n=20)
    32/32      87.09n ± 0%   68.62n ± 0%  -21.21% (p=0.000 n=20)
    32/16      48.31n ± 1%   37.66n ± 0%  -22.05% (p=0.000 n=20)
    16/32      64.98n ± 0%   51.62n ± 0%  -20.57% (p=0.000 n=20)
    16/16      34.84n ± 0%   28.02n ± 0%  -19.58% (p=0.000 n=20)
    16/8       20.06n ± 0%   17.45n ± 0%  -12.99% (p=0.000 n=20)
    16/64      123.83n ± 0%  97.84n ± 1%  -20.98% (p=0.000 n=20)
    16/4       12.53n ± 6%   10.64n ± 2%  -15.05% (p=0.000 n=20)

    Change-Id: Ib602d487c9a29c5216f33f293890a1e621dd971d

diff --git a/aom_dsp/x86/variance_impl_avx2.c b/aom_dsp/x86/variance_impl_avx2.c
index d402697501..93439e5d5f 100644
--- a/aom_dsp/x86/variance_impl_avx2.c
+++ b/aom_dsp/x86/variance_impl_avx2.c
@@ -167,200 +167,348 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
 // don't have to manually optimize the individual for-loops. We could save some
 // binary size by optimizing the loops more carefully without duplicating the
 // codes with a macro.
-#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height)                           \
-  static inline int aom_sub_pixel_variance32x##height##_imp_avx2(             \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst, int dst_stride, unsigned int *sse) {                \
-    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
-    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;    \
-    __m256i zero_reg;                                                         \
-    int i, sum;                                                               \
-    sum_reg = _mm256_setzero_si256();                                         \
-    sse_reg = _mm256_setzero_si256();                                         \
-    zero_reg = _mm256_setzero_si256();                                        \
-                                                                              \
-    /* x_offset = 0 and y_offset = 0 */                                       \
-    if (x_offset == 0) {                                                      \
-      if (y_offset == 0) {                                                    \
-        for (i = 0; i < height; i++) {                                        \
-          LOAD_SRC_DST                                                        \
-          /* expend each byte to 2 bytes */                                   \
-          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += src_stride;                                                  \
-          dst += dst_stride;                                                  \
-        }                                                                     \
-        /* x_offset = 0 and y_offset = 4 */                                   \
-      } else if (y_offset == 4) {                                             \
-        __m256i src_next_reg;                                                 \
-        for (i = 0; i < height; i++) {                                        \
-          LOAD_SRC_DST                                                        \
-          AVG_NEXT_SRC(src_reg, src_stride)                                   \
-          /* expend each byte to 2 bytes */                                   \
-          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += src_stride;                                                  \
-          dst += dst_stride;                                                  \
-        }                                                                     \
-        /* x_offset = 0 and y_offset = bilin interpolation */                 \
-      } else {                                                                \
-        __m256i filter, pw8, src_next_reg;                                    \
-                                                                              \
-        y_offset <<= 5;                                                       \
-        filter = _mm256_load_si256(                                           \
-            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
-        pw8 = _mm256_set1_epi16(8);                                           \
-        for (i = 0; i < height; i++) {                                        \
-          LOAD_SRC_DST                                                        \
-          MERGE_NEXT_SRC(src_reg, src_stride)                                 \
-          FILTER_SRC(filter)                                                  \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += src_stride;                                                  \
-          dst += dst_stride;                                                  \
-        }                                                                     \
-      }                                                                       \
-      /* x_offset = 4  and y_offset = 0 */                                    \
-    } else if (x_offset == 4) {                                               \
-      if (y_offset == 0) {                                                    \
-        __m256i src_next_reg;                                                 \
-        for (i = 0; i < height; i++) {                                        \
-          LOAD_SRC_DST                                                        \
-          AVG_NEXT_SRC(src_reg, 1)                                            \
-          /* expand each byte to 2 bytes */                                   \
-          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += src_stride;                                                  \
-          dst += dst_stride;                                                  \
-        }                                                                     \
-        /* x_offset = 4  and y_offset = 4 */                                  \
-      } else if (y_offset == 4) {                                             \
-        __m256i src_next_reg, src_avg;                                        \
-        /* load source and another source starting from the next */           \
-        /* following byte */                                                  \
-        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
-        AVG_NEXT_SRC(src_reg, 1)                                              \
-        for (i = 0; i < height; i++) {                                        \
-          src_avg = src_reg;                                                  \
-          src += src_stride;                                                  \
-          LOAD_SRC_DST                                                        \
-          AVG_NEXT_SRC(src_reg, 1)                                            \
-          /* average between previous average to current average */           \
-          src_avg = _mm256_avg_epu8(src_avg, src_reg);                        \
-          /* expand each byte to 2 bytes */                                   \
-          MERGE_WITH_SRC(src_avg, zero_reg)                                   \
-          /* save current source average */                                   \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          dst += dst_stride;                                                  \
-        }                                                                     \
-        /* x_offset = 4  and y_offset = bilin interpolation */                \
-      } else {                                                                \
-        __m256i filter, pw8, src_next_reg, src_avg;                           \
-        y_offset <<= 5;                                                       \
-        filter = _mm256_load_si256(                                           \
-            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
-        pw8 = _mm256_set1_epi16(8);                                           \
-        /* load source and another source starting from the next */           \
-        /* following byte */                                                  \
-        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
-        AVG_NEXT_SRC(src_reg, 1)                                              \
-        for (i = 0; i < height; i++) {                                        \
-          /* save current source average */                                   \
-          src_avg = src_reg;                                                  \
-          src += src_stride;                                                  \
-          LOAD_SRC_DST                                                        \
-          AVG_NEXT_SRC(src_reg, 1)                                            \
-          MERGE_WITH_SRC(src_avg, src_reg)                                    \
-          FILTER_SRC(filter)                                                  \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          dst += dst_stride;                                                  \
-        }                                                                     \
-      }                                                                       \
-      /* x_offset = bilin interpolation and y_offset = 0 */                   \
-    } else {                                                                  \
-      if (y_offset == 0) {                                                    \
-        __m256i filter, pw8, src_next_reg;                                    \
-        x_offset <<= 5;                                                       \
-        filter = _mm256_load_si256(                                           \
-            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
-        pw8 = _mm256_set1_epi16(8);                                           \
-        for (i = 0; i < height; i++) {                                        \
-          LOAD_SRC_DST                                                        \
-          MERGE_NEXT_SRC(src_reg, 1)                                          \
-          FILTER_SRC(filter)                                                  \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += src_stride;                                                  \
-          dst += dst_stride;                                                  \
-        }                                                                     \
-        /* x_offset = bilin interpolation and y_offset = 4 */                 \
-      } else if (y_offset == 4) {                                             \
-        __m256i filter, pw8, src_next_reg, src_pack;                          \
-        x_offset <<= 5;                                                       \
-        filter = _mm256_load_si256(                                           \
-            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
-        pw8 = _mm256_set1_epi16(8);                                           \
-        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
-        MERGE_NEXT_SRC(src_reg, 1)                                            \
-        FILTER_SRC(filter)                                                    \
-        /* convert each 16 bit to 8 bit to each low and high lane source */   \
-        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
-        for (i = 0; i < height; i++) {                                        \
-          src += src_stride;                                                  \
-          LOAD_SRC_DST                                                        \
-          MERGE_NEXT_SRC(src_reg, 1)                                          \
-          FILTER_SRC(filter)                                                  \
-          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
-          /* average between previous pack to the current */                  \
-          src_pack = _mm256_avg_epu8(src_pack, src_reg);                      \
-          MERGE_WITH_SRC(src_pack, zero_reg)                                  \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src_pack = src_reg;                                                 \
-          dst += dst_stride;                                                  \
-        }                                                                     \
-        /* x_offset = bilin interpolation and y_offset = bilin interpolation  \
-         */                                                                   \
-      } else {                                                                \
-        __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;                \
-        x_offset <<= 5;                                                       \
-        xfilter = _mm256_load_si256(                                          \
-            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
-        y_offset <<= 5;                                                       \
-        yfilter = _mm256_load_si256(                                          \
-            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
-        pw8 = _mm256_set1_epi16(8);                                           \
-        /* load source and another source starting from the next */           \
-        /* following byte */                                                  \
-        src_reg = _mm256_loadu_si256((__m256i const *)(src));                 \
-        MERGE_NEXT_SRC(src_reg, 1)                                            \
-                                                                              \
-        FILTER_SRC(xfilter)                                                   \
-        /* convert each 16 bit to 8 bit to each low and high lane source */   \
-        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
-        for (i = 0; i < height; i++) {                                        \
-          src += src_stride;                                                  \
-          LOAD_SRC_DST                                                        \
-          MERGE_NEXT_SRC(src_reg, 1)                                          \
-          FILTER_SRC(xfilter)                                                 \
-          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
-          /* merge previous pack to current pack source */                    \
-          MERGE_WITH_SRC(src_pack, src_reg)                                   \
-          /* filter the source */                                             \
-          FILTER_SRC(yfilter)                                                 \
-          src_pack = src_reg;                                                 \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          dst += dst_stride;                                                  \
-        }                                                                     \
-      }                                                                       \
-    }                                                                         \
-    CALC_SUM_AND_SSE                                                          \
-    _mm256_zeroupper();                                                       \
-    return sum;                                                               \
-  }                                                                           \
-  unsigned int aom_sub_pixel_variance32x##height##_avx2(                      \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst, int dst_stride, unsigned int *sse) {                \
-    const int sum = aom_sub_pixel_variance32x##height##_imp_avx2(             \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sse);           \
-    return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height));   \
+#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height)                            \
+  static inline int aom_sub_pixel_variance32x##height##_imp_avx2(              \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
+      const uint8_t *dst, int dst_stride, unsigned int *sse) {                 \
+    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;  \
+    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;     \
+    __m256i zero_reg;                                                          \
+    int i, sum;                                                                \
+    sum_reg = _mm256_setzero_si256();                                          \
+    sse_reg = _mm256_setzero_si256();                                          \
+    zero_reg = _mm256_setzero_si256();                                         \
+                                                                               \
+    /* x_offset = 0 and y_offset = 0 */                                        \
+    if (x_offset == 0) {                                                       \
+      if (y_offset == 0) {                                                     \
+        for (i = 0; i < height; i++) {                                         \
+          LOAD_SRC_DST                                                         \
+          /* expend each byte to 2 bytes */                                    \
+          MERGE_WITH_SRC(src_reg, zero_reg)                                    \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src += src_stride;                                                   \
+          dst += dst_stride;                                                   \
+        }                                                                      \
+        /* x_offset = 0 and y_offset = 4 */                                    \
+      } else if (y_offset == 4) {                                              \
+        __m256i src_next_reg;                                                  \
+        for (i = 0; i < height; i++) {                                         \
+          LOAD_SRC_DST                                                         \
+          AVG_NEXT_SRC(src_reg, src_stride)                                    \
+          /* expend each byte to 2 bytes */                                    \
+          MERGE_WITH_SRC(src_reg, zero_reg)                                    \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src += src_stride;                                                   \
+          dst += dst_stride;                                                   \
+        }                                                                      \
+        /* x_offset = 0 and y_offset = bilin interpolation */                  \
+      } else {                                                                 \
+        __m256i filter, pw8, src_next_reg;                                     \
+                                                                               \
+        y_offset <<= 5;                                                        \
+        filter = _mm256_load_si256(                                            \
+            (__m256i const *)(bilinear_filters_avx2 + y_offset));              \
+        pw8 = _mm256_set1_epi16(8);                                            \
+        for (i = 0; i < height; i++) {                                         \
+          LOAD_SRC_DST                                                         \
+          MERGE_NEXT_SRC(src_reg, src_stride)                                  \
+          FILTER_SRC(filter)                                                   \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src += src_stride;                                                   \
+          dst += dst_stride;                                                   \
+        }                                                                      \
+      }                                                                        \
+      /* x_offset = 4  and y_offset = 0 */                                     \
+    } else if (x_offset == 4) {                                                \
+      if (y_offset == 0) {                                                     \
+        __m256i src_next_reg;                                                  \
+        for (i = 0; i < height; i++) {                                         \
+          LOAD_SRC_DST                                                         \
+          AVG_NEXT_SRC(src_reg, 1)                                             \
+          /* expand each byte to 2 bytes */                                    \
+          MERGE_WITH_SRC(src_reg, zero_reg)                                    \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src += src_stride;                                                   \
+          dst += dst_stride;                                                   \
+        }                                                                      \
+        /* x_offset = 4  and y_offset = 4 */                                   \
+      } else if (y_offset == 4) {                                              \
+        __m256i src_next_reg, src_avg;                                         \
+        /* load source and another source starting from the next */            \
+        /* following byte */                                                   \
+        src_reg = _mm256_loadu_si256((__m256i const *)(src));                  \
+        AVG_NEXT_SRC(src_reg, 1)                                               \
+        for (i = 0; i < height; i++) {                                         \
+          src_avg = src_reg;                                                   \
+          src += src_stride;                                                   \
+          LOAD_SRC_DST                                                         \
+          AVG_NEXT_SRC(src_reg, 1)                                             \
+          /* average between previous average to current average */            \
+          src_avg = _mm256_avg_epu8(src_avg, src_reg);                         \
+          /* expand each byte to 2 bytes */                                    \
+          MERGE_WITH_SRC(src_avg, zero_reg)                                    \
+          /* save current source average */                                    \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          dst += dst_stride;                                                   \
+        }                                                                      \
+        /* x_offset = 4  and y_offset = bilin interpolation */                 \
+      } else {                                                                 \
+        __m256i filter, pw8, src_next_reg, src_avg;                            \
+        y_offset <<= 5;                                                        \
+        filter = _mm256_load_si256(                                            \
+            (__m256i const *)(bilinear_filters_avx2 + y_offset));              \
+        pw8 = _mm256_set1_epi16(8);                                            \
+        /* load source and another source starting from the next */            \
+        /* following byte */                                                   \
+        src_reg = _mm256_loadu_si256((__m256i const *)(src));                  \
+        AVG_NEXT_SRC(src_reg, 1)                                               \
+        for (i = 0; i < height; i++) {                                         \
+          /* save current source average */                                    \
+          src_avg = src_reg;                                                   \
+          src += src_stride;                                                   \
+          LOAD_SRC_DST                                                         \
+          AVG_NEXT_SRC(src_reg, 1)                                             \
+          MERGE_WITH_SRC(src_avg, src_reg)                                     \
+          FILTER_SRC(filter)                                                   \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          dst += dst_stride;                                                   \
+        }                                                                      \
+      }                                                                        \
+      /* x_offset = bilin interpolation and y_offset = 0 */                    \
+    } else {                                                                   \
+      if (y_offset == 0) {                                                     \
+        __m256i filter, pw8, src_next_reg;                                     \
+        x_offset <<= 5;                                                        \
+        filter = _mm256_load_si256(                                            \
+            (__m256i const *)(bilinear_filters_avx2 + x_offset));              \
+        pw8 = _mm256_set1_epi16(8);                                            \
+        for (i = 0; i < height; i++) {                                         \
+          LOAD_SRC_DST                                                         \
+          MERGE_NEXT_SRC(src_reg, 1)                                           \
+          FILTER_SRC(filter)                                                   \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src += src_stride;                                                   \
+          dst += dst_stride;                                                   \
+        }                                                                      \
+        /* x_offset = bilin interpolation and y_offset = 4 */                  \
+      } else if (y_offset == 4) {                                              \
+        __m256i filter, pw8, src_next_reg, src_pack;                           \
+        x_offset <<= 5;                                                        \
+        filter = _mm256_load_si256(                                            \
+            (__m256i const *)(bilinear_filters_avx2 + x_offset));              \
+        pw8 = _mm256_set1_epi16(8);                                            \
+        src_reg = _mm256_loadu_si256((__m256i const *)(src));                  \
+        MERGE_NEXT_SRC(src_reg, 1)                                             \
+        FILTER_SRC(filter)                                                     \
+        /* convert each 16 bit to 8 bit to each low and high lane source */    \
+        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);                \
+        for (i = 0; i < height; i++) {                                         \
+          src += src_stride;                                                   \
+          LOAD_SRC_DST                                                         \
+          MERGE_NEXT_SRC(src_reg, 1)                                           \
+          FILTER_SRC(filter)                                                   \
+          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
+          /* average between previous pack to the current */                   \
+          src_pack = _mm256_avg_epu8(src_pack, src_reg);                       \
+          MERGE_WITH_SRC(src_pack, zero_reg)                                   \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src_pack = src_reg;                                                  \
+          dst += dst_stride;                                                   \
+        }                                                                      \
+        /* x_offset = bilin interpolation and y_offset = bilin interpolation   \
+         */                                                                    \
+      } else {                                                                 \
+        __m256i xfilter, yfilter, pw8, mask_00ff;                              \
+        __m256i p0, p1, p2;                                                    \
+        const uint8_t *src_ptr = src;                                          \
+        x_offset <<= 5;                                                        \
+        xfilter = _mm256_load_si256(                                           \
+            (__m256i const *)(bilinear_filters_avx2 + x_offset));              \
+        y_offset <<= 5;                                                        \
+        yfilter = _mm256_load_si256(                                           \
+            (__m256i const *)(bilinear_filters_avx2 + y_offset));              \
+        pw8 = _mm256_set1_epi16(8);                                            \
+        mask_00ff = _mm256_set1_epi16(0x00ff);                                 \
+                                                                               \
+        {                                                                      \
+          __m256i s0 = _mm256_loadu_si256((__m256i const *)(src_ptr));         \
+          __m256i s1 = _mm256_loadu_si256((__m256i const *)(src_ptr + 1));     \
+          __m256i he = _mm256_maddubs_epi16(s0, xfilter);                      \
+          __m256i ho = _mm256_maddubs_epi16(s1, xfilter);                      \
+          he = _mm256_srai_epi16(_mm256_add_epi16(he, pw8), 4);                \
+          ho = _mm256_srai_epi16(_mm256_add_epi16(ho, pw8), 4);                \
+          p0 = _mm256_packus_epi16(he, ho);                                    \
+          src_ptr += src_stride;                                               \
+        }                                                                      \
+        {                                                                      \
+          __m256i s0 = _mm256_loadu_si256((__m256i const *)(src_ptr));         \
+          __m256i s1 = _mm256_loadu_si256((__m256i const *)(src_ptr + 1));     \
+          __m256i he = _mm256_maddubs_epi16(s0, xfilter);                      \
+          __m256i ho = _mm256_maddubs_epi16(s1, xfilter);                      \
+          he = _mm256_srai_epi16(_mm256_add_epi16(he, pw8), 4);                \
+          ho = _mm256_srai_epi16(_mm256_add_epi16(ho, pw8), 4);                \
+          p1 = _mm256_packus_epi16(he, ho);                                    \
+          src_ptr += src_stride;                                               \
+        }                                                                      \
+        {                                                                      \
+          __m256i s0 = _mm256_loadu_si256((__m256i const *)(src_ptr));         \
+          __m256i s1 = _mm256_loadu_si256((__m256i const *)(src_ptr + 1));     \
+          __m256i he = _mm256_maddubs_epi16(s0, xfilter);                      \
+          __m256i ho = _mm256_maddubs_epi16(s1, xfilter);                      \
+          he = _mm256_srai_epi16(_mm256_add_epi16(he, pw8), 4);                \
+          ho = _mm256_srai_epi16(_mm256_add_epi16(ho, pw8), 4);                \
+          p2 = _mm256_packus_epi16(he, ho);                                    \
+          src_ptr += src_stride;                                               \
+        }                                                                      \
+                                                                               \
+        for (i = 0; i < height - 2; i += 2) {                                  \
+          __m256i p3, p4, v_ev_A, v_od_A, dst_A, dst_A_ev, dst_A_od;           \
+          __m256i diff_A_ev, diff_A_od, v_ev_B, v_od_B, dst_B, dst_B_ev;       \
+          __m256i dst_B_od, diff_B_ev, diff_B_od, sum_comb, sse_comb;          \
+                                                                               \
+          {                                                                    \
+            __m256i s0 = _mm256_loadu_si256((__m256i const *)(src_ptr));       \
+            __m256i s1 = _mm256_loadu_si256((__m256i const *)(src_ptr + 1));   \
+            __m256i he = _mm256_maddubs_epi16(s0, xfilter);                    \
+            __m256i ho = _mm256_maddubs_epi16(s1, xfilter);                    \
+            he = _mm256_srai_epi16(_mm256_add_epi16(he, pw8), 4);              \
+            ho = _mm256_srai_epi16(_mm256_add_epi16(ho, pw8), 4);              \
+            p3 = _mm256_packus_epi16(he, ho);                                  \
+            src_ptr += src_stride;                                             \
+          }                                                                    \
+          {                                                                    \
+            __m256i s0 = _mm256_loadu_si256((__m256i const *)(src_ptr));       \
+            __m256i s1 = _mm256_loadu_si256((__m256i const *)(src_ptr + 1));   \
+            __m256i he = _mm256_maddubs_epi16(s0, xfilter);                    \
+            __m256i ho = _mm256_maddubs_epi16(s1, xfilter);                    \
+            he = _mm256_srai_epi16(_mm256_add_epi16(he, pw8), 4);              \
+            ho = _mm256_srai_epi16(_mm256_add_epi16(ho, pw8), 4);              \
+            p4 = _mm256_packus_epi16(he, ho);                                  \
+            src_ptr += src_stride;                                             \
+          }                                                                    \
+                                                                               \
+          v_ev_A =                                                             \
+              _mm256_maddubs_epi16(_mm256_unpacklo_epi8(p0, p1), yfilter);     \
+          v_od_A =                                                             \
+              _mm256_maddubs_epi16(_mm256_unpackhi_epi8(p0, p1), yfilter);     \
+          v_ev_A = _mm256_srai_epi16(_mm256_add_epi16(v_ev_A, pw8), 4);        \
+          v_od_A = _mm256_srai_epi16(_mm256_add_epi16(v_od_A, pw8), 4);        \
+                                                                               \
+          dst_A = _mm256_loadu_si256((__m256i const *)(dst));                  \
+          dst += dst_stride;                                                   \
+          dst_A_ev = _mm256_and_si256(dst_A, mask_00ff);                       \
+          dst_A_od = _mm256_srli_epi16(dst_A, 8);                              \
+          diff_A_ev = _mm256_sub_epi16(v_ev_A, dst_A_ev);                      \
+          diff_A_od = _mm256_sub_epi16(v_od_A, dst_A_od);                      \
+                                                                               \
+          v_ev_B =                                                             \
+              _mm256_maddubs_epi16(_mm256_unpacklo_epi8(p1, p2), yfilter);     \
+          v_od_B =                                                             \
+              _mm256_maddubs_epi16(_mm256_unpackhi_epi8(p1, p2), yfilter);     \
+          v_ev_B = _mm256_srai_epi16(_mm256_add_epi16(v_ev_B, pw8), 4);        \
+          v_od_B = _mm256_srai_epi16(_mm256_add_epi16(v_od_B, pw8), 4);        \
+                                                                               \
+          dst_B = _mm256_loadu_si256((__m256i const *)(dst));                  \
+          dst += dst_stride;                                                   \
+          dst_B_ev = _mm256_and_si256(dst_B, mask_00ff);                       \
+          dst_B_od = _mm256_srli_epi16(dst_B, 8);                              \
+          diff_B_ev = _mm256_sub_epi16(v_ev_B, dst_B_ev);                      \
+          diff_B_od = _mm256_sub_epi16(v_od_B, dst_B_od);                      \
+                                                                               \
+          sum_comb = _mm256_add_epi16(_mm256_add_epi16(diff_A_ev, diff_A_od),  \
+                                      _mm256_add_epi16(diff_B_ev, diff_B_od)); \
+          sum_reg = _mm256_add_epi16(sum_reg, sum_comb);                       \
+                                                                               \
+          sse_comb = _mm256_add_epi32(                                         \
+              _mm256_add_epi32(_mm256_madd_epi16(diff_A_ev, diff_A_ev),        \
+                               _mm256_madd_epi16(diff_A_od, diff_A_od)),       \
+              _mm256_add_epi32(_mm256_madd_epi16(diff_B_ev, diff_B_ev),        \
+                               _mm256_madd_epi16(diff_B_od, diff_B_od)));      \
+          sse_reg = _mm256_add_epi32(sse_reg, sse_comb);                       \
+                                                                               \
+          p0 = p2;                                                             \
+          p1 = p3;                                                             \
+          p2 = p4;                                                             \
+        }                                                                      \
+                                                                               \
+        {                                                                      \
+          __m256i v_ev_A, v_od_A, dst_A, dst_A_ev, dst_A_od;                   \
+          __m256i diff_A_ev, diff_A_od, v_ev_B, v_od_B, dst_B, dst_B_ev;       \
+          __m256i dst_B_od, diff_B_ev, diff_B_od, sum_comb, sse_comb;          \
+                                                                               \
+          v_ev_A =                                                             \
+              _mm256_maddubs_epi16(_mm256_unpacklo_epi8(p0, p1), yfilter);     \
+          v_od_A =                                                             \
+              _mm256_maddubs_epi16(_mm256_unpackhi_epi8(p0, p1), yfilter);     \
+          v_ev_A = _mm256_srai_epi16(_mm256_add_epi16(v_ev_A, pw8), 4);        \
+          v_od_A = _mm256_srai_epi16(_mm256_add_epi16(v_od_A, pw8), 4);        \
+                                                                               \
+          dst_A = _mm256_loadu_si256((__m256i const *)(dst));                  \
+          dst += dst_stride;                                                   \
+          dst_A_ev = _mm256_and_si256(dst_A, mask_00ff);                       \
+          dst_A_od = _mm256_srli_epi16(dst_A, 8);                              \
+          diff_A_ev = _mm256_sub_epi16(v_ev_A, dst_A_ev);                      \
+          diff_A_od = _mm256_sub_epi16(v_od_A, dst_A_od);                      \
+                                                                               \
+          v_ev_B =                                                             \
+              _mm256_maddubs_epi16(_mm256_unpacklo_epi8(p1, p2), yfilter);     \
+          v_od_B =                                                             \
+              _mm256_maddubs_epi16(_mm256_unpackhi_epi8(p1, p2), yfilter);     \
+          v_ev_B = _mm256_srai_epi16(_mm256_add_epi16(v_ev_B, pw8), 4);        \
+          v_od_B = _mm256_srai_epi16(_mm256_add_epi16(v_od_B, pw8), 4);        \
+                                                                               \
+          dst_B = _mm256_loadu_si256((__m256i const *)(dst));                  \
+          dst += dst_stride;                                                   \
+          dst_B_ev = _mm256_and_si256(dst_B, mask_00ff);                       \
+          dst_B_od = _mm256_srli_epi16(dst_B, 8);                              \
+          diff_B_ev = _mm256_sub_epi16(v_ev_B, dst_B_ev);                      \
+          diff_B_od = _mm256_sub_epi16(v_od_B, dst_B_od);                      \
+                                                                               \
+          sum_comb = _mm256_add_epi16(_mm256_add_epi16(diff_A_ev, diff_A_od),  \
+                                      _mm256_add_epi16(diff_B_ev, diff_B_od)); \
+          sum_reg = _mm256_add_epi16(sum_reg, sum_comb);                       \
+                                                                               \
+          sse_comb = _mm256_add_epi32(                                         \
+              _mm256_add_epi32(_mm256_madd_epi16(diff_A_ev, diff_A_ev),        \
+                               _mm256_madd_epi16(diff_A_od, diff_A_od)),       \
+              _mm256_add_epi32(_mm256_madd_epi16(diff_B_ev, diff_B_ev),        \
+                               _mm256_madd_epi16(diff_B_od, diff_B_od)));      \
+          sse_reg = _mm256_add_epi32(sse_reg, sse_comb);                       \
+        }                                                                      \
+                                                                               \
+        {                                                                      \
+          __m256i sse_hi, sum_hi;                                              \
+          int f_sse, f_sum;                                                    \
+          sum_reg = _mm256_madd_epi16(sum_reg, _mm256_set1_epi16(1));          \
+          sse_hi = _mm256_srli_si256(sse_reg, 8);                              \
+          sum_hi = _mm256_srli_si256(sum_reg, 8);                              \
+          sse_reg = _mm256_add_epi32(sse_reg, sse_hi);                         \
+          sum_reg = _mm256_add_epi32(sum_reg, sum_hi);                         \
+          sse_hi = _mm256_srli_si256(sse_reg, 4);                              \
+          sum_hi = _mm256_srli_si256(sum_reg, 4);                              \
+          sse_reg = _mm256_add_epi32(sse_reg, sse_hi);                         \
+          sum_reg = _mm256_add_epi32(sum_reg, sum_hi);                         \
+          f_sse = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +         \
+                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1));     \
+          f_sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +         \
+                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));     \
+          *sse = f_sse;                                                        \
+          _mm256_zeroupper();                                                  \
+          return f_sum;                                                        \
+        }                                                                      \
+      }                                                                        \
+    }                                                                          \
+    CALC_SUM_AND_SSE                                                           \
+    _mm256_zeroupper();                                                        \
+    return sum;                                                                \
+  }                                                                            \
+  unsigned int aom_sub_pixel_variance32x##height##_avx2(                       \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
+      const uint8_t *dst, int dst_stride, unsigned int *sse) {                 \
+    const int sum = aom_sub_pixel_variance32x##height##_imp_avx2(              \
+        src, src_stride, x_offset, y_offset, dst, dst_stride, sse);            \
+    return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height));    \
   }

 MAKE_SUB_PIXEL_VAR_32XH(64, 6)
@@ -400,243 +548,381 @@ AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 64, 6, 7)
 AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 64, 6, 6)
 AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 32, 6, 5)

-#define MAKE_SUB_PIXEL_VAR_16XH(height, log2height)                           \
-  unsigned int aom_sub_pixel_variance16x##height##_avx2(                      \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst, int dst_stride, unsigned int *sse) {                \
-    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
-    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;    \
-    __m256i zero_reg;                                                         \
-    int i, sum;                                                               \
-    sum_reg = _mm256_setzero_si256();                                         \
-    sse_reg = _mm256_setzero_si256();                                         \
-    zero_reg = _mm256_setzero_si256();                                        \
-                                                                              \
-    /* x_offset = 0 and y_offset = 0 */                                       \
-    if (x_offset == 0) {                                                      \
-      if (y_offset == 0) {                                                    \
-        for (i = 0; i < height; i += 2) {                                     \
-          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
-          /* expend each byte to 2 bytes */                                   \
-          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += (src_stride << 1);                                           \
-          dst += (dst_stride << 1);                                           \
-        }                                                                     \
-        /* x_offset = 0 and y_offset = 4 */                                   \
-      } else if (y_offset == 4) {                                             \
-        __m256i src_next_reg;                                                 \
-        for (i = 0; i < height; i += 2) {                                     \
-          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
-          AVG_NEXT_SRC_INSERT(src_reg, src_stride)                            \
-          /* expend each byte to 2 bytes */                                   \
-          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += (src_stride << 1);                                           \
-          dst += (dst_stride << 1);                                           \
-        }                                                                     \
-        /* x_offset = 0 and y_offset = bilin interpolation */                 \
-      } else {                                                                \
-        __m256i filter, pw8, src_next_reg;                                    \
-        y_offset <<= 5;                                                       \
-        filter = _mm256_load_si256(                                           \
-            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
-        pw8 = _mm256_set1_epi16(8);                                           \
-        for (i = 0; i < height; i += 2) {                                     \
-          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
-          MERGE_NEXT_SRC_INSERT(src_reg, src_stride)                          \
-          FILTER_SRC(filter)                                                  \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += (src_stride << 1);                                           \
-          dst += (dst_stride << 1);                                           \
-        }                                                                     \
-      }                                                                       \
-      /* x_offset = 4  and y_offset = 0 */                                    \
-    } else if (x_offset == 4) {                                               \
-      if (y_offset == 0) {                                                    \
-        __m256i src_next_reg;                                                 \
-        for (i = 0; i < height; i += 2) {                                     \
-          LOAD_SRC_NEXT_BYTE_INSERT                                           \
-          LOAD_DST_INSERT                                                     \
-          /* average between current and next stride source */                \
-          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);                   \
-          /* expand each byte to 2 bytes */                                   \
-          MERGE_WITH_SRC(src_reg, zero_reg)                                   \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += (src_stride << 1);                                           \
-          dst += (dst_stride << 1);                                           \
-        }                                                                     \
-        /* x_offset = 4  and y_offset = 4 */                                  \
-      } else if (y_offset == 4) {                                             \
-        __m256i src_next_reg, src_avg, src_temp;                              \
-        /* load and insert source and next row source */                      \
-        LOAD_SRC_NEXT_BYTE_INSERT                                             \
-        src_avg = _mm256_avg_epu8(src_reg, src_next_reg);                     \
-        src += src_stride << 1;                                               \
-        for (i = 0; i < height - 2; i += 2) {                                 \
-          LOAD_SRC_NEXT_BYTE_INSERT                                           \
-          src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);              \
-          src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);  \
-          src_temp = _mm256_avg_epu8(src_avg, src_temp);                      \
-          LOAD_DST_INSERT                                                     \
-          /* expand each byte to 2 bytes */                                   \
-          MERGE_WITH_SRC(src_temp, zero_reg)                                  \
-          /* save current source average */                                   \
-          src_avg = src_next_reg;                                             \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          dst += dst_stride << 1;                                             \
-          src += src_stride << 1;                                             \
-        }                                                                     \
-        /* last 2 rows processing happens here */                             \
-        __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));                \
-        __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));            \
-        src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);                       \
-        src_next_reg = _mm256_permute2x128_si256(                             \
-            src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);                \
-        LOAD_DST_INSERT                                                       \
-        src_avg = _mm256_avg_epu8(src_avg, src_next_reg);                     \
-        MERGE_WITH_SRC(src_avg, zero_reg)                                     \
-        CALC_SUM_SSE_INSIDE_LOOP                                              \
-      } else {                                                                \
-        /* x_offset = 4  and y_offset = bilin interpolation */                \
-        __m256i filter, pw8, src_next_reg, src_avg, src_temp;                 \
-        y_offset <<= 5;                                                       \
-        filter = _mm256_load_si256(                                           \
-            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
-        pw8 = _mm256_set1_epi16(8);                                           \
-        /* load and insert source and next row source */                      \
-        LOAD_SRC_NEXT_BYTE_INSERT                                             \
-        src_avg = _mm256_avg_epu8(src_reg, src_next_reg);                     \
-        src += src_stride << 1;                                               \
-        for (i = 0; i < height - 2; i += 2) {                                 \
-          LOAD_SRC_NEXT_BYTE_INSERT                                           \
-          src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);              \
-          src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);  \
-          LOAD_DST_INSERT                                                     \
-          MERGE_WITH_SRC(src_avg, src_temp)                                   \
-          /* save current source average */                                   \
-          src_avg = src_next_reg;                                             \
-          FILTER_SRC(filter)                                                  \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          dst += dst_stride << 1;                                             \
-          src += src_stride << 1;                                             \
-        }                                                                     \
-        /* last 2 rows processing happens here */                             \
-        __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));                \
-        __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));            \
-        src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);                       \
-        src_next_reg = _mm256_permute2x128_si256(                             \
-            src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);                \
-        LOAD_DST_INSERT                                                       \
-        MERGE_WITH_SRC(src_avg, src_next_reg)                                 \
-        FILTER_SRC(filter)                                                    \
-        CALC_SUM_SSE_INSIDE_LOOP                                              \
-      }                                                                       \
-      /* x_offset = bilin interpolation and y_offset = 0 */                   \
-    } else {                                                                  \
-      if (y_offset == 0) {                                                    \
-        __m256i filter, pw8, src_next_reg;                                    \
-        x_offset <<= 5;                                                       \
-        filter = _mm256_load_si256(                                           \
-            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
-        pw8 = _mm256_set1_epi16(8);                                           \
-        for (i = 0; i < height; i += 2) {                                     \
-          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                         \
-          MERGE_NEXT_SRC_INSERT(src_reg, 1)                                   \
-          FILTER_SRC(filter)                                                  \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += (src_stride << 1);                                           \
-          dst += (dst_stride << 1);                                           \
-        }                                                                     \
-        /* x_offset = bilin interpolation and y_offset = 4 */                 \
-      } else if (y_offset == 4) {                                             \
-        __m256i filter, pw8, src_next_reg, src_pack;                          \
-        x_offset <<= 5;                                                       \
-        filter = _mm256_load_si256(                                           \
-            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
-        pw8 = _mm256_set1_epi16(8);                                           \
-        /* load and insert source and next row source */                      \
-        LOAD_SRC_NEXT_BYTE_INSERT                                             \
-        MERGE_WITH_SRC(src_reg, src_next_reg)                                 \
-        FILTER_SRC(filter)                                                    \
-        /* convert each 16 bit to 8 bit to each low and high lane source */   \
-        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
-        src += src_stride << 1;                                               \
-        for (i = 0; i < height - 2; i += 2) {                                 \
-          LOAD_SRC_NEXT_BYTE_INSERT                                           \
-          LOAD_DST_INSERT                                                     \
-          MERGE_WITH_SRC(src_reg, src_next_reg)                               \
-          FILTER_SRC(filter)                                                  \
-          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
-          src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);  \
-          /* average between previous pack to the current */                  \
-          src_pack = _mm256_avg_epu8(src_pack, src_next_reg);                 \
-          MERGE_WITH_SRC(src_pack, zero_reg)                                  \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src_pack = src_reg;                                                 \
-          src += src_stride << 1;                                             \
-          dst += dst_stride << 1;                                             \
-        }                                                                     \
-        /* last 2 rows processing happens here */                             \
-        LOAD_SRC_MERGE_128BIT(filter)                                         \
-        LOAD_DST_INSERT                                                       \
-        FILTER_SRC_128BIT(filter_128bit)                                      \
-        src_reg_0 = _mm_packus_epi16(src_lo, src_hi);                         \
-        src_next_reg = _mm256_permute2x128_si256(                             \
-            src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);               \
-        /* average between previous pack to the current */                    \
-        src_pack = _mm256_avg_epu8(src_pack, src_next_reg);                   \
-        MERGE_WITH_SRC(src_pack, zero_reg)                                    \
-        CALC_SUM_SSE_INSIDE_LOOP                                              \
-      } else {                                                                \
-        /* x_offset = bilin interpolation and y_offset = bilin interpolation  \
-         */                                                                   \
-        __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;                \
-        x_offset <<= 5;                                                       \
-        xfilter = _mm256_load_si256(                                          \
-            (__m256i const *)(bilinear_filters_avx2 + x_offset));             \
-        y_offset <<= 5;                                                       \
-        yfilter = _mm256_load_si256(                                          \
-            (__m256i const *)(bilinear_filters_avx2 + y_offset));             \
-        pw8 = _mm256_set1_epi16(8);                                           \
-        /* load and insert source and next row source */                      \
-        LOAD_SRC_NEXT_BYTE_INSERT                                             \
-        MERGE_WITH_SRC(src_reg, src_next_reg)                                 \
-        FILTER_SRC(xfilter)                                                   \
-        /* convert each 16 bit to 8 bit to each low and high lane source */   \
-        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
-        src += src_stride << 1;                                               \
-        for (i = 0; i < height - 2; i += 2) {                                 \
-          LOAD_SRC_NEXT_BYTE_INSERT                                           \
-          LOAD_DST_INSERT                                                     \
-          MERGE_WITH_SRC(src_reg, src_next_reg)                               \
-          FILTER_SRC(xfilter)                                                 \
-          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);              \
-          src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);  \
-          /* average between previous pack to the current */                  \
-          MERGE_WITH_SRC(src_pack, src_next_reg)                              \
-          /* filter the source */                                             \
-          FILTER_SRC(yfilter)                                                 \
-          src_pack = src_reg;                                                 \
-          CALC_SUM_SSE_INSIDE_LOOP                                            \
-          src += src_stride << 1;                                             \
-          dst += dst_stride << 1;                                             \
-        }                                                                     \
-        /* last 2 rows processing happens here */                             \
-        LOAD_SRC_MERGE_128BIT(xfilter)                                        \
-        LOAD_DST_INSERT                                                       \
-        FILTER_SRC_128BIT(filter_128bit)                                      \
-        src_reg_0 = _mm_packus_epi16(src_lo, src_hi);                         \
-        src_next_reg = _mm256_permute2x128_si256(                             \
-            src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);               \
-        MERGE_WITH_SRC(src_pack, src_next_reg)                                \
-        FILTER_SRC(yfilter)                                                   \
-        CALC_SUM_SSE_INSIDE_LOOP                                              \
-      }                                                                       \
-    }                                                                         \
-    CALC_SUM_AND_SSE                                                          \
-    _mm256_zeroupper();                                                       \
-    return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height));   \
+#define MAKE_SUB_PIXEL_VAR_16XH(height, log2height) /* 16x##height subpel variance; writes *sse */ \
+  unsigned int aom_sub_pixel_variance16x##height##_avx2(                       \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
+      const uint8_t *dst, int dst_stride, unsigned int *sse) {                 \
+    __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;  \
+    __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;     \
+    __m256i zero_reg;                                                          \
+    int i, sum;                                                                \
+    sum_reg = _mm256_setzero_si256();                                          \
+    sse_reg = _mm256_setzero_si256();                                          \
+    zero_reg = _mm256_setzero_si256();                                         \
+                                                                               \
+    /* x_offset = 0 and y_offset = 0 */                                        \
+    if (x_offset == 0) {                                                       \
+      if (y_offset == 0) {                                                     \
+        for (i = 0; i < height; i += 2) {                                      \
+          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                          \
+          /* expand each byte to 2 bytes */                                    \
+          MERGE_WITH_SRC(src_reg, zero_reg)                                    \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src += (src_stride << 1);                                            \
+          dst += (dst_stride << 1);                                            \
+        }                                                                      \
+        /* x_offset = 0 and y_offset = 4 */                                    \
+      } else if (y_offset == 4) {                                              \
+        __m256i src_next_reg;                                                  \
+        for (i = 0; i < height; i += 2) {                                      \
+          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                          \
+          AVG_NEXT_SRC_INSERT(src_reg, src_stride)                             \
+          /* expand each byte to 2 bytes */                                    \
+          MERGE_WITH_SRC(src_reg, zero_reg)                                    \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src += (src_stride << 1);                                            \
+          dst += (dst_stride << 1);                                            \
+        }                                                                      \
+        /* x_offset = 0 and y_offset = bilin interpolation */                  \
+      } else {                                                                 \
+        __m256i filter, pw8, src_next_reg;                                     \
+        y_offset <<= 5; /* each filter table entry is 32 bytes */              \
+        filter = _mm256_load_si256(                                            \
+            (__m256i const *)(bilinear_filters_avx2 + y_offset));              \
+        pw8 = _mm256_set1_epi16(8);                                            \
+        for (i = 0; i < height; i += 2) {                                      \
+          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                          \
+          MERGE_NEXT_SRC_INSERT(src_reg, src_stride)                           \
+          FILTER_SRC(filter)                                                   \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src += (src_stride << 1);                                            \
+          dst += (dst_stride << 1);                                            \
+        }                                                                      \
+      }                                                                        \
+      /* x_offset = 4  and y_offset = 0 */                                     \
+    } else if (x_offset == 4) {                                                \
+      if (y_offset == 0) {                                                     \
+        __m256i src_next_reg;                                                  \
+        for (i = 0; i < height; i += 2) {                                      \
+          LOAD_SRC_NEXT_BYTE_INSERT                                            \
+          LOAD_DST_INSERT                                                      \
+          /* average between current and next stride source */                 \
+          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);                    \
+          /* expand each byte to 2 bytes */                                    \
+          MERGE_WITH_SRC(src_reg, zero_reg)                                    \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src += (src_stride << 1);                                            \
+          dst += (dst_stride << 1);                                            \
+        }                                                                      \
+        /* x_offset = 4  and y_offset = 4 */                                   \
+      } else if (y_offset == 4) {                                              \
+        __m256i src_next_reg, src_avg, src_temp;                               \
+        /* load and insert source and next row source */                       \
+        LOAD_SRC_NEXT_BYTE_INSERT                                              \
+        src_avg = _mm256_avg_epu8(src_reg, src_next_reg);                      \
+        src += src_stride << 1;                                                \
+        for (i = 0; i < height - 2; i += 2) {                                  \
+          LOAD_SRC_NEXT_BYTE_INSERT                                            \
+          src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);               \
+          src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);   \
+          src_temp = _mm256_avg_epu8(src_avg, src_temp);                       \
+          LOAD_DST_INSERT                                                      \
+          /* expand each byte to 2 bytes */                                    \
+          MERGE_WITH_SRC(src_temp, zero_reg)                                   \
+          /* save current source average */                                    \
+          src_avg = src_next_reg;                                              \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          dst += dst_stride << 1;                                              \
+          src += src_stride << 1;                                              \
+        }                                                                      \
+        /* last 2 rows processing happens here */                              \
+        __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));                 \
+        __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));             \
+        src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);                        \
+        src_next_reg = _mm256_permute2x128_si256(                              \
+            src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);                 \
+        LOAD_DST_INSERT                                                        \
+        src_avg = _mm256_avg_epu8(src_avg, src_next_reg);                      \
+        MERGE_WITH_SRC(src_avg, zero_reg)                                      \
+        CALC_SUM_SSE_INSIDE_LOOP                                               \
+      } else {                                                                 \
+        /* x_offset = 4  and y_offset = bilin interpolation */                 \
+        __m256i filter, pw8, src_next_reg, src_avg, src_temp;                  \
+        y_offset <<= 5; /* each filter table entry is 32 bytes */              \
+        filter = _mm256_load_si256(                                            \
+            (__m256i const *)(bilinear_filters_avx2 + y_offset));              \
+        pw8 = _mm256_set1_epi16(8);                                            \
+        /* load and insert source and next row source */                       \
+        LOAD_SRC_NEXT_BYTE_INSERT                                              \
+        src_avg = _mm256_avg_epu8(src_reg, src_next_reg);                      \
+        src += src_stride << 1;                                                \
+        for (i = 0; i < height - 2; i += 2) {                                  \
+          LOAD_SRC_NEXT_BYTE_INSERT                                            \
+          src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);               \
+          src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);   \
+          LOAD_DST_INSERT                                                      \
+          MERGE_WITH_SRC(src_avg, src_temp)                                    \
+          /* save current source average */                                    \
+          src_avg = src_next_reg;                                              \
+          FILTER_SRC(filter)                                                   \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          dst += dst_stride << 1;                                              \
+          src += src_stride << 1;                                              \
+        }                                                                      \
+        /* last 2 rows processing happens here */                              \
+        __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));                 \
+        __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));             \
+        src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);                        \
+        src_next_reg = _mm256_permute2x128_si256(                              \
+            src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);                 \
+        LOAD_DST_INSERT                                                        \
+        MERGE_WITH_SRC(src_avg, src_next_reg)                                  \
+        FILTER_SRC(filter)                                                     \
+        CALC_SUM_SSE_INSIDE_LOOP                                               \
+      }                                                                        \
+      /* x_offset = bilin interpolation and y_offset = 0 */                    \
+    } else {                                                                   \
+      if (y_offset == 0) {                                                     \
+        __m256i filter, pw8, src_next_reg;                                     \
+        x_offset <<= 5;                                                        \
+        filter = _mm256_load_si256(                                            \
+            (__m256i const *)(bilinear_filters_avx2 + x_offset));              \
+        pw8 = _mm256_set1_epi16(8);                                            \
+        for (i = 0; i < height; i += 2) {                                      \
+          LOAD_SRC_DST_INSERT(src_stride, dst_stride)                          \
+          MERGE_NEXT_SRC_INSERT(src_reg, 1)                                    \
+          FILTER_SRC(filter)                                                   \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src += (src_stride << 1);                                            \
+          dst += (dst_stride << 1);                                            \
+        }                                                                      \
+        /* x_offset = bilin interpolation and y_offset = 4 */                  \
+      } else if (y_offset == 4) {                                              \
+        __m256i filter, pw8, src_next_reg, src_pack;                           \
+        x_offset <<= 5;                                                        \
+        filter = _mm256_load_si256(                                            \
+            (__m256i const *)(bilinear_filters_avx2 + x_offset));              \
+        pw8 = _mm256_set1_epi16(8);                                            \
+        /* load and insert source and next row source */                       \
+        LOAD_SRC_NEXT_BYTE_INSERT                                              \
+        MERGE_WITH_SRC(src_reg, src_next_reg)                                  \
+        FILTER_SRC(filter)                                                     \
+        /* convert each 16 bit to 8 bit to each low and high lane source */    \
+        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);                \
+        src += src_stride << 1;                                                \
+        for (i = 0; i < height - 2; i += 2) {                                  \
+          LOAD_SRC_NEXT_BYTE_INSERT                                            \
+          LOAD_DST_INSERT                                                      \
+          MERGE_WITH_SRC(src_reg, src_next_reg)                                \
+          FILTER_SRC(filter)                                                   \
+          src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);               \
+          src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);   \
+          /* average between previous pack to the current */                   \
+          src_pack = _mm256_avg_epu8(src_pack, src_next_reg);                  \
+          MERGE_WITH_SRC(src_pack, zero_reg)                                   \
+          CALC_SUM_SSE_INSIDE_LOOP                                             \
+          src_pack = src_reg;                                                  \
+          src += src_stride << 1;                                              \
+          dst += dst_stride << 1;                                              \
+        }                                                                      \
+        /* last 2 rows processing happens here */                              \
+        LOAD_SRC_MERGE_128BIT(filter)                                          \
+        LOAD_DST_INSERT                                                        \
+        FILTER_SRC_128BIT(filter_128bit)                                       \
+        src_reg_0 = _mm_packus_epi16(src_lo, src_hi);                          \
+        src_next_reg = _mm256_permute2x128_si256(                              \
+            src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);                \
+        /* average between previous pack to the current */                     \
+        src_pack = _mm256_avg_epu8(src_pack, src_next_reg);                    \
+        MERGE_WITH_SRC(src_pack, zero_reg)                                     \
+        CALC_SUM_SSE_INSIDE_LOOP                                               \
+      } else { /* x_offset = bilin and y_offset = bilin: pipelined path */     \
+        __m256i xfilter, yfilter, pw8, mask_00ff;                              \
+        __m256i p0, p1;                                                        \
+        const uint8_t *src_ptr = src;                                          \
+        x_offset <<= 5;                                                        \
+        xfilter = _mm256_load_si256(                                           \
+            (__m256i const *)(bilinear_filters_avx2 + x_offset));              \
+        y_offset <<= 5;                                                        \
+        yfilter = _mm256_load_si256(                                           \
+            (__m256i const *)(bilinear_filters_avx2 + y_offset));              \
+        pw8 = _mm256_set1_epi16(8);                                            \
+        mask_00ff = _mm256_set1_epi16(0x00ff);                                 \
+                                                                               \
+        { /* prologue: x-filter rows 0-1 into p0 (one row per 128-bit lane) */ \
+          __m256i s0 = _mm256_inserti128_si256(                                \
+              _mm256_castsi128_si256(                                          \
+                  _mm_loadu_si128((__m128i const *)(src_ptr))),                \
+              _mm_loadu_si128((__m128i const *)(src_ptr + src_stride)), 1);    \
+          __m256i s1 = _mm256_inserti128_si256(                                \
+              _mm256_castsi128_si256(                                          \
+                  _mm_loadu_si128((__m128i const *)(src_ptr + 1))),            \
+              _mm_loadu_si128((__m128i const *)(src_ptr + src_stride + 1)),    \
+              1);                                                              \
+          __m256i he = _mm256_maddubs_epi16(s0, xfilter);                      \
+          __m256i ho = _mm256_maddubs_epi16(s1, xfilter);                      \
+          he = _mm256_srai_epi16(_mm256_add_epi16(he, pw8), 4);                \
+          ho = _mm256_srai_epi16(_mm256_add_epi16(ho, pw8), 4);                \
+          p0 = _mm256_packus_epi16(he, ho);                                    \
+          src_ptr += src_stride << 1;                                          \
+        }                                                                      \
+        { /* prologue: x-filter rows 2-3 into p1 */                            \
+          __m256i s0 = _mm256_inserti128_si256(                                \
+              _mm256_castsi128_si256(                                          \
+                  _mm_loadu_si128((__m128i const *)(src_ptr))),                \
+              _mm_loadu_si128((__m128i const *)(src_ptr + src_stride)), 1);    \
+          __m256i s1 = _mm256_inserti128_si256(                                \
+              _mm256_castsi128_si256(                                          \
+                  _mm_loadu_si128((__m128i const *)(src_ptr + 1))),            \
+              _mm_loadu_si128((__m128i const *)(src_ptr + src_stride + 1)),    \
+              1);                                                              \
+          __m256i he = _mm256_maddubs_epi16(s0, xfilter);                      \
+          __m256i ho = _mm256_maddubs_epi16(s1, xfilter);                      \
+          he = _mm256_srai_epi16(_mm256_add_epi16(he, pw8), 4);                \
+          ho = _mm256_srai_epi16(_mm256_add_epi16(ho, pw8), 4);                \
+          p1 = _mm256_packus_epi16(he, ho);                                    \
+          src_ptr += src_stride << 1;                                          \
+        }                                                                      \
+                                                                               \
+        for (i = 0; i < height - 4; i += 2) { /* pipelined steady state */     \
+          __m256i p2, p_mix, v_ev, v_od, dst_ev, dst_od;                       \
+          __m256i diff_ev, diff_od, sum_comb, sse_comb;                        \
+                                                                               \
+          { /* x-filter the row pair two ahead into p2 (overlaps y work) */    \
+            __m256i s0 = _mm256_inserti128_si256(                              \
+                _mm256_castsi128_si256(                                        \
+                    _mm_loadu_si128((__m128i const *)(src_ptr))),              \
+                _mm_loadu_si128((__m128i const *)(src_ptr + src_stride)), 1);  \
+            __m256i s1 = _mm256_inserti128_si256(                              \
+                _mm256_castsi128_si256(                                        \
+                    _mm_loadu_si128((__m128i const *)(src_ptr + 1))),          \
+                _mm_loadu_si128((__m128i const *)(src_ptr + src_stride + 1)),  \
+                1);                                                            \
+            __m256i he = _mm256_maddubs_epi16(s0, xfilter);                    \
+            __m256i ho = _mm256_maddubs_epi16(s1, xfilter);                    \
+            he = _mm256_srai_epi16(_mm256_add_epi16(he, pw8), 4);              \
+            ho = _mm256_srai_epi16(_mm256_add_epi16(ho, pw8), 4);              \
+            p2 = _mm256_packus_epi16(he, ho);                                  \
+            src_ptr += src_stride << 1;                                        \
+          }                                                                    \
+                                                                               \
+          p_mix = _mm256_permute2x128_si256(p0, p1, 0x21); /* rows +1 */       \
+          v_ev =                                                               \
+              _mm256_maddubs_epi16(_mm256_unpacklo_epi8(p0, p_mix), yfilter);  \
+          v_od =                                                               \
+              _mm256_maddubs_epi16(_mm256_unpackhi_epi8(p0, p_mix), yfilter);  \
+          v_ev = _mm256_srai_epi16(_mm256_add_epi16(v_ev, pw8), 4);            \
+          v_od = _mm256_srai_epi16(_mm256_add_epi16(v_od, pw8), 4);            \
+                                                                               \
+          dst_reg = _mm256_inserti128_si256(                                   \
+              _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(dst))), \
+              _mm_loadu_si128((__m128i const *)(dst + dst_stride)), 1);        \
+          dst += dst_stride << 1;                                              \
+          dst_ev = _mm256_and_si256(dst_reg, mask_00ff); /* even bytes */      \
+          dst_od = _mm256_srli_epi16(dst_reg, 8);        /* odd bytes */       \
+          diff_ev = _mm256_sub_epi16(v_ev, dst_ev);                            \
+          diff_od = _mm256_sub_epi16(v_od, dst_od);                            \
+                                                                               \
+          sum_comb = _mm256_add_epi16(diff_ev, diff_od);                       \
+          sum_reg = _mm256_add_epi16(sum_reg, sum_comb);                       \
+                                                                               \
+          sse_comb = _mm256_add_epi32(_mm256_madd_epi16(diff_ev, diff_ev),     \
+                                      _mm256_madd_epi16(diff_od, diff_od));    \
+          sse_reg = _mm256_add_epi32(sse_reg, sse_comb);                       \
+                                                                               \
+          p0 = p1;                                                             \
+          p1 = p2;                                                             \
+        }                                                                      \
+                                                                               \
+        { /* epilogue: y-filter the pair in p0 (no new pair to prefetch) */    \
+          __m256i p_mix = _mm256_permute2x128_si256(p0, p1, 0x21);             \
+          __m256i v_ev =                                                       \
+              _mm256_maddubs_epi16(_mm256_unpacklo_epi8(p0, p_mix), yfilter);  \
+          __m256i v_od =                                                       \
+              _mm256_maddubs_epi16(_mm256_unpackhi_epi8(p0, p_mix), yfilter);  \
+          __m256i dst_ev, dst_od, diff_ev, diff_od, sum_comb, sse_comb;        \
+          v_ev = _mm256_srai_epi16(_mm256_add_epi16(v_ev, pw8), 4);            \
+          v_od = _mm256_srai_epi16(_mm256_add_epi16(v_od, pw8), 4);            \
+                                                                               \
+          dst_reg = _mm256_inserti128_si256(                                   \
+              _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(dst))), \
+              _mm_loadu_si128((__m128i const *)(dst + dst_stride)), 1);        \
+          dst += dst_stride << 1;                                              \
+          dst_ev = _mm256_and_si256(dst_reg, mask_00ff);                       \
+          dst_od = _mm256_srli_epi16(dst_reg, 8);                              \
+          diff_ev = _mm256_sub_epi16(v_ev, dst_ev);                            \
+          diff_od = _mm256_sub_epi16(v_od, dst_od);                            \
+                                                                               \
+          sum_comb = _mm256_add_epi16(diff_ev, diff_od);                       \
+          sum_reg = _mm256_add_epi16(sum_reg, sum_comb);                       \
+                                                                               \
+          sse_comb = _mm256_add_epi32(_mm256_madd_epi16(diff_ev, diff_ev),     \
+                                      _mm256_madd_epi16(diff_od, diff_od));    \
+          sse_reg = _mm256_add_epi32(sse_reg, sse_comb);                       \
+        }                                                                      \
+        { /* final pair: x-filter one row below the block for the y tap */     \
+          __m128i s0 = _mm_loadu_si128((__m128i const *)(src_ptr));            \
+          __m128i s1 = _mm_loadu_si128((__m128i const *)(src_ptr + 1));        \
+          __m128i he = _mm_maddubs_epi16(s0, _mm256_castsi256_si128(xfilter)); \
+          __m128i ho = _mm_maddubs_epi16(s1, _mm256_castsi256_si128(xfilter)); \
+          __m256i p_last, p_mix, v_ev, v_od, dst_ev, dst_od;                   \
+          __m256i diff_ev, diff_od, sum_comb, sse_comb;                        \
+          he = _mm_srai_epi16(_mm_add_epi16(he, _mm256_castsi256_si128(pw8)),  \
+                              4);                                              \
+          ho = _mm_srai_epi16(_mm_add_epi16(ho, _mm256_castsi256_si128(pw8)),  \
+                              4);                                              \
+          p_last = _mm256_castsi128_si256(_mm_packus_epi16(he, ho));           \
+                                                                               \
+          p_mix = _mm256_permute2x128_si256(p1, p_last, 0x21);                 \
+          v_ev =                                                               \
+              _mm256_maddubs_epi16(_mm256_unpacklo_epi8(p1, p_mix), yfilter);  \
+          v_od =                                                               \
+              _mm256_maddubs_epi16(_mm256_unpackhi_epi8(p1, p_mix), yfilter);  \
+          v_ev = _mm256_srai_epi16(_mm256_add_epi16(v_ev, pw8), 4);            \
+          v_od = _mm256_srai_epi16(_mm256_add_epi16(v_od, pw8), 4);            \
+                                                                               \
+          dst_reg = _mm256_inserti128_si256(                                   \
+              _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(dst))), \
+              _mm_loadu_si128((__m128i const *)(dst + dst_stride)), 1);        \
+          dst_ev = _mm256_and_si256(dst_reg, mask_00ff);                       \
+          dst_od = _mm256_srli_epi16(dst_reg, 8);                              \
+          diff_ev = _mm256_sub_epi16(v_ev, dst_ev);                            \
+          diff_od = _mm256_sub_epi16(v_od, dst_od);                            \
+                                                                               \
+          sum_comb = _mm256_add_epi16(diff_ev, diff_od);                       \
+          sum_reg = _mm256_add_epi16(sum_reg, sum_comb);                       \
+                                                                               \
+          sse_comb = _mm256_add_epi32(_mm256_madd_epi16(diff_ev, diff_ev),     \
+                                      _mm256_madd_epi16(diff_od, diff_od));    \
+          sse_reg = _mm256_add_epi32(sse_reg, sse_comb);                       \
+        }                                                                      \
+                                                                               \
+        { /* horizontal reduction of sum/sse and variance computation */       \
+          __m256i sse_hi, sum_hi;                                              \
+          int f_sse, f_sum;                                                    \
+          sum_reg = _mm256_madd_epi16(sum_reg, _mm256_set1_epi16(1)); /* widen i16 sums to i32 */ \
+          sse_hi = _mm256_srli_si256(sse_reg, 8);                              \
+          sum_hi = _mm256_srli_si256(sum_reg, 8);                              \
+          sse_reg = _mm256_add_epi32(sse_reg, sse_hi);                         \
+          sum_reg = _mm256_add_epi32(sum_reg, sum_hi);                         \
+          sse_hi = _mm256_srli_si256(sse_reg, 4);                              \
+          sum_hi = _mm256_srli_si256(sum_reg, 4);                              \
+          sse_reg = _mm256_add_epi32(sse_reg, sse_hi);                         \
+          sum_reg = _mm256_add_epi32(sum_reg, sum_hi);                         \
+          f_sse = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +         \
+                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1));     \
+          f_sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +         \
+                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));     \
+          *sse = f_sse;                                                        \
+          _mm256_zeroupper();                                                  \
+          return f_sse -                                                       \
+                 (unsigned int)(((int64_t)f_sum * f_sum) >> (4 + log2height)); \
+        }                                                                      \
+      }                                                                        \
+    }                                                                          \
+    CALC_SUM_AND_SSE                                                           \
+    _mm256_zeroupper();                                                        \
+    return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height));    \
+  }

 MAKE_SUB_PIXEL_VAR_16XH(32, 5)