Commit 1c33438c43 for aom

commit 1c33438c43f5bce1c31173bee8e77ebb954955c7
Author: Li Zhang <li.zhang2@arm.com>
Date:   Mon Feb 23 11:46:33 2026 +0100

    Add Armv8.4 Neon DotProd subpel variance paths

    For block widths >= 8, merge the source pixel bilinear interpolation
    into the variance kernel. For 4x8 and 4x16 blocks - where merging
    bilinear interpolation into the variance kernel is not beneficial - keep
    the original approach but with a direct call to the Neon DotProd
    variance kernel. (The Armv8.0 Neon implementation remains fastest for
    4x4 blocks.)

    Also add the relevant unit tests.

    This is a port from SVT-AV1:
    https://gitlab.com/AOMediaCodec/SVT-AV1/-/merge_requests/2608

    Originally authored by: Jonathan Wright <Jonathan.Wright@arm.com> and
    Gerda Zsejke More <gerdazsejke.more@arm.com>

    Change-Id: I64e0beab7c00087b1a3febc6edbd30e79bf7bc83

diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 3047bc7b2b..43ffe68fd6 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -312,6 +312,7 @@ if(CONFIG_AV1_ENCODER)
               "${AOM_ROOT}/aom_dsp/arm/sad_neon_dotprod.c"
               "${AOM_ROOT}/aom_dsp/arm/sadxd_neon_dotprod.c"
               "${AOM_ROOT}/aom_dsp/arm/sse_neon_dotprod.c"
+              "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon_dotprod.c"
               "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c"
               "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c")

diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 776d272d23..a3b3564704 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1406,21 +1406,21 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_variance4x8       sse2      neon neon_dotprod/;
   specialize qw/aom_variance4x4       sse2      neon/;

-  specialize qw/aom_sub_pixel_variance128x128   avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance128x64    avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance64x128    avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance64x64     avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance64x32     avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance32x64     avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance32x32     avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance32x16     avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance16x32     avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance16x16     avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance16x8      avx2 neon ssse3/;
-  specialize qw/aom_sub_pixel_variance8x16           neon ssse3/;
-  specialize qw/aom_sub_pixel_variance8x8            neon ssse3/;
-  specialize qw/aom_sub_pixel_variance8x4            neon ssse3/;
-  specialize qw/aom_sub_pixel_variance4x8            neon ssse3/;
+  specialize qw/aom_sub_pixel_variance128x128   avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance128x64    avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance64x128    avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance64x64     avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance64x32     avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance32x64     avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance32x32     avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance32x16     avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance16x32     avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance16x16     avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance16x8      avx2 neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance8x16           neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance8x8            neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance8x4            neon neon_dotprod ssse3/;
+  specialize qw/aom_sub_pixel_variance4x8            neon neon_dotprod ssse3/;
   specialize qw/aom_sub_pixel_variance4x4            neon ssse3/;

   specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon ssse3/;
@@ -1448,12 +1448,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/;
     specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/;

-    specialize qw/aom_sub_pixel_variance4x16 neon ssse3/;
-    specialize qw/aom_sub_pixel_variance16x4 neon avx2 ssse3/;
-    specialize qw/aom_sub_pixel_variance8x32 neon ssse3/;
-    specialize qw/aom_sub_pixel_variance32x8 neon ssse3/;
-    specialize qw/aom_sub_pixel_variance16x64 neon avx2 ssse3/;
-    specialize qw/aom_sub_pixel_variance64x16 neon ssse3/;
+    specialize qw/aom_sub_pixel_variance4x16 neon neon_dotprod ssse3/;
+    specialize qw/aom_sub_pixel_variance16x4 neon neon_dotprod avx2 ssse3/;
+    specialize qw/aom_sub_pixel_variance8x32 neon neon_dotprod ssse3/;
+    specialize qw/aom_sub_pixel_variance32x8 neon neon_dotprod ssse3/;
+    specialize qw/aom_sub_pixel_variance16x64 neon neon_dotprod avx2 ssse3/;
+    specialize qw/aom_sub_pixel_variance64x16 neon neon_dotprod ssse3/;
     specialize qw/aom_sub_pixel_avg_variance4x16 neon ssse3/;
     specialize qw/aom_sub_pixel_avg_variance16x4 neon ssse3/;
     specialize qw/aom_sub_pixel_avg_variance8x32 neon ssse3/;
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 489c8d0b5b..2ba6d386b9 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -19,128 +19,7 @@

 #include "aom_dsp/variance.h"
 #include "aom_dsp/arm/mem_neon.h"
-
-static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
-                                      int src_stride, int pixel_step,
-                                      int dst_height, int filter_offset) {
-  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
-  const uint8x8_t f1 = vdup_n_u8(filter_offset);
-
-  int i = dst_height;
-  do {
-    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
-    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
-    uint16x8_t blend = vmull_u8(s0, f0);
-    blend = vmlal_u8(blend, s1, f1);
-    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
-    vst1_u8(dst_ptr, blend_u8);
-
-    src_ptr += 2 * src_stride;
-    dst_ptr += 2 * 4;
-    i -= 2;
-  } while (i != 0);
-}
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
-                                      int src_stride, int pixel_step,
-                                      int dst_height, int filter_offset) {
-  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
-  const uint8x8_t f1 = vdup_n_u8(filter_offset);
-
-  int i = dst_height;
-  do {
-    uint8x8_t s0 = vld1_u8(src_ptr);
-    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
-    uint16x8_t blend = vmull_u8(s0, f0);
-    blend = vmlal_u8(blend, s1, f1);
-    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
-    vst1_u8(dst_ptr, blend_u8);
-
-    src_ptr += src_stride;
-    dst_ptr += 8;
-  } while (--i != 0);
-}
-
-static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
-                                         uint8_t *dst_ptr, int src_stride,
-                                         int pixel_step, int dst_width,
-                                         int dst_height, int filter_offset) {
-  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
-  const uint8x8_t f1 = vdup_n_u8(filter_offset);
-
-  int i = dst_height;
-  do {
-    int j = 0;
-    do {
-      uint8x16_t s0 = vld1q_u8(src_ptr + j);
-      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
-      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
-      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
-      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
-      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
-      uint8x16_t blend_u8 =
-          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
-      vst1q_u8(dst_ptr + j, blend_u8);
-
-      j += 16;
-    } while (j < dst_width);
-
-    src_ptr += src_stride;
-    dst_ptr += dst_width;
-  } while (--i != 0);
-}
-
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
-                                       int src_stride, int pixel_step,
-                                       int dst_height, int filter_offset) {
-  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
-                               dst_height, filter_offset);
-}
-
-static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
-                                       int src_stride, int pixel_step,
-                                       int dst_height, int filter_offset) {
-  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
-                               dst_height, filter_offset);
-}
-
-static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
-                                       int src_stride, int pixel_step,
-                                       int dst_height, int filter_offset) {
-  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
-                               dst_height, filter_offset);
-}
-
-static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
-                                        uint8_t *dst_ptr, int src_stride,
-                                        int pixel_step, int dst_height,
-                                        int filter_offset) {
-  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
-                               dst_height, filter_offset);
-}
-
-static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
-                                   int src_stride, int pixel_step,
-                                   int dst_width, int dst_height) {
-  // We only specialise on the filter values for large block sizes (>= 16x16.)
-  assert(dst_width >= 16 && dst_width % 16 == 0);
-
-  int i = dst_height;
-  do {
-    int j = 0;
-    do {
-      uint8x16_t s0 = vld1q_u8(src_ptr + j);
-      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
-      uint8x16_t avg = vrhaddq_u8(s0, s1);
-      vst1q_u8(dst_ptr + j, avg);
-
-      j += 16;
-    } while (j < dst_width);
-
-    src_ptr += src_stride;
-    dst_ptr += dst_width;
-  } while (--i != 0);
-}
+#include "aom_dsp/arm/subpel_variance_neon.h"

 #define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
   unsigned int aom_sub_pixel_variance##w##x##h##_neon(                   \
diff --git a/aom_dsp/arm/subpel_variance_neon.h b/aom_dsp/arm/subpel_variance_neon.h
new file mode 100644
index 0000000000..18baba6e5c
--- /dev/null
+++ b/aom_dsp/arm/subpel_variance_neon.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_SUBPEL_VARIANCE_NEON_H_
+#define AOM_AOM_DSP_ARM_SUBPEL_VARIANCE_NEON_H_
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+// 2-tap bilinear filter for 4-pixel-wide blocks. Every output byte is
+//   (src[x] * (8 - filter_offset) + src[x + pixel_step] * filter_offset + 4)
+//       >> 3.
+// Each iteration consumes two source rows (load_unaligned_u8 is assumed to
+// gather 4 bytes from each of them) and stores 8 bytes, so the output is
+// packed with a stride of 4. dst_height must be even and non-zero.
+static inline void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+                                             uint8_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    uint16x8_t blend = vmull_u8(s0, f0);
+    blend = vmlal_u8(blend, s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    vst1_u8(dst_ptr, blend_u8);
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    i -= 2;
+  } while (i != 0);
+}
+
+// 2-tap bilinear filter for 8-pixel-wide blocks. Applies the same rounded
+// blend as the 4-wide filter, one row per iteration, and packs the output
+// with a stride of 8. dst_height must be non-zero.
+static inline void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+                                             uint8_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = vld1_u8(src_ptr);
+    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+    uint16x8_t blend = vmull_u8(s0, f0);
+    blend = vmlal_u8(blend, s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    vst1_u8(dst_ptr, blend_u8);
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+  } while (--i != 0);
+}
+
+// 2-tap bilinear filter for wide blocks, processing 16 pixels per inner
+// iteration. The output is packed with a stride of dst_width, which is
+// expected to be a multiple of 16 (the wrappers below pass 16, 32, 64 and
+// 128). dst_height must be non-zero.
+static inline void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+                                                uint8_t *dst_ptr,
+                                                int src_stride, int pixel_step,
+                                                int dst_width, int dst_height,
+                                                int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+      uint8x16_t blend_u8 =
+          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+      vst1q_u8(dst_ptr + j, blend_u8);
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Fixed-width entry points, so that callers can pick a filter by pasting the
+// block width into the name (var_filter_block2d_bil_w##w).
+static inline void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+                                              uint8_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+                               dst_height, filter_offset);
+}
+
+static inline void var_filter_block2d_bil_w32(const uint8_t *src_ptr,
+                                              uint8_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+                               dst_height, filter_offset);
+}
+
+static inline void var_filter_block2d_bil_w64(const uint8_t *src_ptr,
+                                              uint8_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+                               dst_height, filter_offset);
+}
+
+static inline void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int pixel_step, int dst_height,
+                                               int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
+                               dst_height, filter_offset);
+}
+
+// Bilinear filter specialised for filter_offset == 4: both taps are then
+// equal, so the blend is a rounding average of src[x] and src[x + pixel_step].
+// The output is packed with a stride of dst_width.
+static inline void var_filter_block2d_avg(const uint8_t *src_ptr,
+                                          uint8_t *dst_ptr, int src_stride,
+                                          int pixel_step, int dst_width,
+                                          int dst_height) {
+  // We only specialise on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint8x16_t avg = vrhaddq_u8(s0, s1);
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+#endif  // AOM_AOM_DSP_ARM_SUBPEL_VARIANCE_NEON_H_
diff --git a/aom_dsp/arm/subpel_variance_neon_dotprod.c b/aom_dsp/arm/subpel_variance_neon_dotprod.c
new file mode 100644
index 0000000000..6cb12d82c2
--- /dev/null
+++ b/aom_dsp/arm/subpel_variance_neon_dotprod.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/subpel_variance_neon.h"
+
+// Fused bilinear filter and variance accumulation for 8-pixel-wide blocks.
+// Two rows are blended per iteration and packed into a single 16-byte vector
+// before being compared with ref. On return *sum is the sum of the filtered
+// source pixels minus the sum of the ref pixels, and *sse is the sum of
+// squared differences. h must be even and non-zero.
+static inline void bil_variance_8xh_neon_dotprod(
+    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+    int pixel_step, int h, uint32_t *sse, int *sum, int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  do {
+    uint8x8_t s0_lo = vld1_u8(src);
+    uint8x8_t s1_lo = vld1_u8(src + pixel_step);
+    uint16x8_t blend_l = vmull_u8(s0_lo, f0);
+    blend_l = vmlal_u8(blend_l, s1_lo, f1);
+    uint8x8_t s0_hi = vld1_u8(src + src_stride);
+    uint8x8_t s1_hi = vld1_u8(src + src_stride + pixel_step);
+    uint16x8_t blend_h = vmull_u8(s0_hi, f0);
+    blend_h = vmlal_u8(blend_h, s1_hi, f1);
+    uint8x16_t s =
+        vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+    uint8x16_t r = load_u8_8x2(ref, ref_stride);
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    h -= 2;
+  } while (h != 0);
+
+  int32x4_t sum_diff =
+      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+  *sum = vaddvq_s32(sum_diff);
+  *sse = vaddvq_u32(sse_u32);
+}
+
+// Fused bilinear filter and variance accumulation. src[x] is blended with
+// src[x + pixel_step] using weights (8 - filter_offset, filter_offset) and
+// the result is accumulated against ref using vdotq_u32. Callers pass a
+// pixel_step of 1 to filter horizontally or of one row to filter vertically.
+// w == 8 is forwarded to the 8xh kernel; any other w is expected to be a
+// multiple of 16. h must be non-zero. Outputs are as for the 8xh kernel.
+static inline void bil_variance_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int pixel_step, int w, int h,
+                                             uint32_t *sse, int *sum,
+                                             int filter_offset) {
+  assert(w != 4);
+
+  if (w == 8) {
+    bil_variance_8xh_neon_dotprod(src, src_stride, ref, ref_stride, pixel_step,
+                                  h, sse, sum, filter_offset);
+    return;
+  }
+
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  do {
+    int i = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src + i);
+      uint8x16_t s1 = vld1q_u8(src + i + pixel_step);
+      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+      uint8x16_t s =
+          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+      uint8x16_t r = vld1q_u8(ref + i);
+
+      src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+      ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+      uint8x16_t abs_diff = vabdq_u8(s, r);
+      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+      i += 16;
+    } while (i < w);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--h != 0);
+
+  int32x4_t sum_diff =
+      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+  *sum = vaddvq_s32(sum_diff);
+  *sse = vaddvq_u32(sse_u32);
+}
+
+// As bil_variance_neon_dotprod, but specialised for filter_offset == 4, where
+// the blend reduces to a rounding average (vrhaddq_u8) of the two taps. w
+// must be a non-zero multiple of 16 and h must be non-zero.
+static inline void avg_variance_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int pixel_step, int w, int h,
+                                             uint32_t *sse, int *sum) {
+  assert(w >= 16 && w % 16 == 0);
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  do {
+    int i = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src + i);
+      uint8x16_t s1 = vld1q_u8(src + i + pixel_step);
+      uint8x16_t s = vrhaddq_u8(s0, s1);
+      uint8x16_t r = vld1q_u8(ref + i);
+
+      src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+      ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+      uint8x16_t abs_diff = vabdq_u8(s, r);
+      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+      i += 16;
+    } while (i < w);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--h != 0);
+
+  int32x4_t sum_diff =
+      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+  *sum = vaddvq_s32(sum_diff);
+  *sse = vaddvq_u32(sse_u32);
+}
+
+// 4xh blocks: run both bilinear passes into temporary buffers, then hand the
+// filtered block to the Neon DotProd variance kernel. The first pass produces
+// h + padding rows because the second (vertical) pass reads one row beyond
+// the block and the w4 filter works on pairs of rows.
+#define SUBPEL_VARIANCE_4XH_NEON_DOTPROD(h, padding)                        \
+  unsigned int aom_sub_pixel_variance4x##h##_neon_dotprod(                  \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,         \
+      const uint8_t *ref, int ref_stride, uint32_t *sse) {                  \
+    uint8_t tmp0[4 * (h + padding)];                                        \
+    uint8_t tmp1[4 * h];                                                    \
+    var_filter_block2d_bil_w4(src, tmp0, src_stride, 1, (h + padding),      \
+                              xoffset);                                     \
+    var_filter_block2d_bil_w4(tmp0, tmp1, 4, 4, h, yoffset);                \
+    return aom_variance4x##h##_neon_dotprod(tmp1, 4, ref, ref_stride, sse); \
+  }
+
+// Small blocks (w >= 8): the horizontal pass is written to a temporary buffer
+// and the vertical pass is fused with the variance computation. The variance
+// is sse - sum^2 / (w * h), with the division done as a shift by log2(w * h).
+#define SUBPEL_VARIANCE_WXH_NEON_DOTPROD(w, h, shift, padding)             \
+  unsigned int aom_sub_pixel_variance##w##x##h##_neon_dotprod(             \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {             \
+    uint8_t tmp[w * (h + padding)];                                        \
+    int sum;                                                               \
+    var_filter_block2d_bil_w##w(src, tmp, src_stride, 1, h + padding,      \
+                                xoffset);                                  \
+    bil_variance_neon_dotprod(tmp, w, ref, ref_stride, w, w, h, sse, &sum, \
+                              yoffset);                                    \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);               \
+  }
+
+// Larger blocks: dispatch on the sub-pixel offsets so that the cheapest
+// kernel is used. An offset of 0 needs no filtering in that direction, an
+// offset of 4 is a rounding average, and anything else is a general bilinear
+// blend. The temporary buffer is only used when both directions require
+// filtering.
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(w, h, shift, padding)     \
+  unsigned int aom_sub_pixel_variance##w##x##h##_neon_dotprod(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
+    int sum;                                                                   \
+    if (xoffset == 0) {                                                        \
+      if (yoffset == 0) {                                                      \
+        return aom_variance##w##x##h##_neon_dotprod(src, src_stride, ref,      \
+                                                    ref_stride, sse);          \
+      } else if (yoffset == 4) {                                               \
+        avg_variance_neon_dotprod(src, src_stride, ref, ref_stride,            \
+                                  src_stride, w, h, sse, &sum);                \
+      } else {                                                                 \
+        bil_variance_neon_dotprod(src, src_stride, ref, ref_stride,            \
+                                  src_stride, w, h, sse, &sum, yoffset);       \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint8_t tmp[w * (h + padding)];                                          \
+      if (yoffset == 0) {                                                      \
+        avg_variance_neon_dotprod(src, src_stride, ref, ref_stride, 1, w, h,   \
+                                  sse, &sum);                                  \
+      } else if (yoffset == 4) {                                               \
+        var_filter_block2d_avg(src, tmp, src_stride, 1, w, h + padding);       \
+        avg_variance_neon_dotprod(tmp, w, ref, ref_stride, w, w, h, sse,       \
+                                  &sum);                                       \
+      } else {                                                                 \
+        var_filter_block2d_avg(src, tmp, src_stride, 1, w, h + padding);       \
+        bil_variance_neon_dotprod(tmp, w, ref, ref_stride, w, w, h, sse, &sum, \
+                                  yoffset);                                    \
+      }                                                                        \
+    } else {                                                                   \
+      uint8_t tmp[w * (h + padding)];                                          \
+      if (yoffset == 0) {                                                      \
+        bil_variance_neon_dotprod(src, src_stride, ref, ref_stride, 1, w, h,   \
+                                  sse, &sum, xoffset);                         \
+      } else if (yoffset == 4) {                                               \
+        var_filter_block2d_bil_w##w(src, tmp, src_stride, 1, h + padding,      \
+                                    xoffset);                                  \
+        avg_variance_neon_dotprod(tmp, w, ref, ref_stride, w, w, h, sse,       \
+                                  &sum);                                       \
+      } else {                                                                 \
+        var_filter_block2d_bil_w##w(src, tmp, src_stride, 1, h + padding,      \
+                                    xoffset);                                  \
+        bil_variance_neon_dotprod(tmp, w, ref, ref_stride, w, w, h, sse, &sum, \
+                                  yoffset);                                    \
+      }                                                                        \
+    }                                                                          \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);                   \
+  }
+
+SUBPEL_VARIANCE_4XH_NEON_DOTPROD(8, 2)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(8, 4, 5, 1)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(8, 8, 6, 1)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(8, 16, 7, 1)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(16, 8, 7, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(16, 16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(16, 32, 9, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(32, 16, 9, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(32, 32, 10, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(32, 64, 11, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(64, 32, 11, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(64, 64, 12, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(64, 128, 13, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(128, 64, 13, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(128, 128, 14, 1)
+
+#if !CONFIG_REALTIME_ONLY
+SUBPEL_VARIANCE_4XH_NEON_DOTPROD(16, 2)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(8, 32, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(16, 4, 6, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(16, 64, 10, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(32, 8, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(64, 16, 10, 1)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SUBPEL_VARIANCE_4XH_NEON_DOTPROD
+#undef SUBPEL_VARIANCE_WXH_NEON_DOTPROD
+#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 2a1bf4f165..ff5caeb35e 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -3674,6 +3674,35 @@ const VarianceParams kArrayVariance_neon_dotprod[] = {
 INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AvxVarianceTest,
                          ::testing::ValuesIn(kArrayVariance_neon_dotprod));

+const SubpelVarianceParams kArraySubpelVariance_neon_dotprod[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_neon_dotprod, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_neon_dotprod, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_neon_dotprod, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon_dotprod, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_neon_dotprod, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_neon_dotprod, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon_dotprod, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_neon_dotprod, 0),
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_neon_dotprod, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon_dotprod, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_neon_dotprod, 0),
+  SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_neon_dotprod, 0),
+  SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon_dotprod, 0),
+  SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_neon_dotprod, 0),
+  SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_neon_dotprod, 0),
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_neon_dotprod, 0),
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_neon_dotprod, 0),
+  SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_neon_dotprod, 0),
+  SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_neon_dotprod, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_neon_dotprod, 0),
+  SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_neon_dotprod, 0),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, AvxSubpelVarianceTest,
+    ::testing::ValuesIn(kArraySubpelVariance_neon_dotprod));
+
 const GetSseSumParams kArrayGetSseSum8x8Quad_neon_dotprod[] = {
   GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
   GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),