Commit 1c33438c43 for aom
commit 1c33438c43f5bce1c31173bee8e77ebb954955c7
Author: Li Zhang <li.zhang2@arm.com>
Date: Mon Feb 23 11:46:33 2026 +0100
Add Armv8.4 Neon DotProd subpel variance paths
For block widths >= 8, merge the source pixel bilinear interpolation
into the variance kernel. For 4x8 and 4x16 blocks - where merging
bilinear interpolation into the variance kernel is not beneficial - keep
the original approach but with a direct call to the Neon DotProd
variance kernel. (The Armv8.0 Neon implementation remains fastest for
4x4 blocks.)
Also add the relevant unit tests.
This is a port from SVT-AV1:
https://gitlab.com/AOMediaCodec/SVT-AV1/-/merge_requests/2608
Originally authored by: Jonathan Wright <Jonathan.Wright@arm.com> and
Gerda Zsejke More <gerdazsejke.more@arm.com>
Change-Id: I64e0beab7c00087b1a3febc6edbd30e79bf7bc83
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 3047bc7b2b..43ffe68fd6 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -312,6 +312,7 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/arm/sad_neon_dotprod.c"
"${AOM_ROOT}/aom_dsp/arm/sadxd_neon_dotprod.c"
"${AOM_ROOT}/aom_dsp/arm/sse_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon_dotprod.c"
"${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c"
"${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c")
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 776d272d23..a3b3564704 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1406,21 +1406,21 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_variance4x8 sse2 neon neon_dotprod/;
specialize qw/aom_variance4x4 sse2 neon/;
- specialize qw/aom_sub_pixel_variance128x128 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance128x64 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance64x128 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance64x64 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance64x32 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance32x64 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance32x32 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance32x16 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance16x32 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance16x16 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance16x8 avx2 neon ssse3/;
- specialize qw/aom_sub_pixel_variance8x16 neon ssse3/;
- specialize qw/aom_sub_pixel_variance8x8 neon ssse3/;
- specialize qw/aom_sub_pixel_variance8x4 neon ssse3/;
- specialize qw/aom_sub_pixel_variance4x8 neon ssse3/;
+ specialize qw/aom_sub_pixel_variance128x128 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance128x64 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance64x128 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance64x64 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance64x32 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance32x64 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance32x32 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance32x16 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance16x32 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance16x16 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance16x8 avx2 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance8x16 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance8x8 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance8x4 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance4x8 neon neon_dotprod ssse3/;
specialize qw/aom_sub_pixel_variance4x4 neon ssse3/;
specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon ssse3/;
@@ -1448,12 +1448,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/;
specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/;
- specialize qw/aom_sub_pixel_variance4x16 neon ssse3/;
- specialize qw/aom_sub_pixel_variance16x4 neon avx2 ssse3/;
- specialize qw/aom_sub_pixel_variance8x32 neon ssse3/;
- specialize qw/aom_sub_pixel_variance32x8 neon ssse3/;
- specialize qw/aom_sub_pixel_variance16x64 neon avx2 ssse3/;
- specialize qw/aom_sub_pixel_variance64x16 neon ssse3/;
+ specialize qw/aom_sub_pixel_variance4x16 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance16x4 neon neon_dotprod avx2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x32 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance32x8 neon neon_dotprod ssse3/;
+ specialize qw/aom_sub_pixel_variance16x64 neon neon_dotprod avx2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x16 neon neon_dotprod ssse3/;
specialize qw/aom_sub_pixel_avg_variance4x16 neon ssse3/;
specialize qw/aom_sub_pixel_avg_variance16x4 neon ssse3/;
specialize qw/aom_sub_pixel_avg_variance8x32 neon ssse3/;
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 489c8d0b5b..2ba6d386b9 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -19,128 +19,7 @@
#include "aom_dsp/variance.h"
#include "aom_dsp/arm/mem_neon.h"
-
-static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
- int src_stride, int pixel_step,
- int dst_height, int filter_offset) {
- const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
- const uint8x8_t f1 = vdup_n_u8(filter_offset);
-
- int i = dst_height;
- do {
- uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
- uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
- uint16x8_t blend = vmull_u8(s0, f0);
- blend = vmlal_u8(blend, s1, f1);
- uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
- vst1_u8(dst_ptr, blend_u8);
-
- src_ptr += 2 * src_stride;
- dst_ptr += 2 * 4;
- i -= 2;
- } while (i != 0);
-}
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
- int src_stride, int pixel_step,
- int dst_height, int filter_offset) {
- const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
- const uint8x8_t f1 = vdup_n_u8(filter_offset);
-
- int i = dst_height;
- do {
- uint8x8_t s0 = vld1_u8(src_ptr);
- uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
- uint16x8_t blend = vmull_u8(s0, f0);
- blend = vmlal_u8(blend, s1, f1);
- uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
- vst1_u8(dst_ptr, blend_u8);
-
- src_ptr += src_stride;
- dst_ptr += 8;
- } while (--i != 0);
-}
-
-static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
- uint8_t *dst_ptr, int src_stride,
- int pixel_step, int dst_width,
- int dst_height, int filter_offset) {
- const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
- const uint8x8_t f1 = vdup_n_u8(filter_offset);
-
- int i = dst_height;
- do {
- int j = 0;
- do {
- uint8x16_t s0 = vld1q_u8(src_ptr + j);
- uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
- uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
- blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
- uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
- blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
- uint8x16_t blend_u8 =
- vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
- vst1q_u8(dst_ptr + j, blend_u8);
-
- j += 16;
- } while (j < dst_width);
-
- src_ptr += src_stride;
- dst_ptr += dst_width;
- } while (--i != 0);
-}
-
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
- int src_stride, int pixel_step,
- int dst_height, int filter_offset) {
- var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
- dst_height, filter_offset);
-}
-
-static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
- int src_stride, int pixel_step,
- int dst_height, int filter_offset) {
- var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
- dst_height, filter_offset);
-}
-
-static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
- int src_stride, int pixel_step,
- int dst_height, int filter_offset) {
- var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
- dst_height, filter_offset);
-}
-
-static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
- uint8_t *dst_ptr, int src_stride,
- int pixel_step, int dst_height,
- int filter_offset) {
- var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
- dst_height, filter_offset);
-}
-
-static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
- int src_stride, int pixel_step,
- int dst_width, int dst_height) {
- // We only specialise on the filter values for large block sizes (>= 16x16.)
- assert(dst_width >= 16 && dst_width % 16 == 0);
-
- int i = dst_height;
- do {
- int j = 0;
- do {
- uint8x16_t s0 = vld1q_u8(src_ptr + j);
- uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
- uint8x16_t avg = vrhaddq_u8(s0, s1);
- vst1q_u8(dst_ptr + j, avg);
-
- j += 16;
- } while (j < dst_width);
-
- src_ptr += src_stride;
- dst_ptr += dst_width;
- } while (--i != 0);
-}
+#include "aom_dsp/arm/subpel_variance_neon.h"
#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
diff --git a/aom_dsp/arm/subpel_variance_neon.h b/aom_dsp/arm/subpel_variance_neon.h
new file mode 100644
index 0000000000..18baba6e5c
--- /dev/null
+++ b/aom_dsp/arm/subpel_variance_neon.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_SUBPEL_VARIANCE_NEON_H_
+#define AOM_AOM_DSP_ARM_SUBPEL_VARIANCE_NEON_H_
+
+#include <arm_neon.h>
+#include "aom_dsp/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Applies a 2-tap bilinear filter to a 4-pixel-wide block:
+//   dst = (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
+// where s1 is read pixel_step bytes after s0. Output rows are written
+// back-to-back (4 bytes per row). Two rows are produced per iteration and the
+// do-while loop only terminates when the counter reaches exactly zero, so
+// dst_height must be even and non-zero.
+static inline void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+                                             uint8_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  // The two filter taps always sum to 8, matching the >> 3 normalisation.
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    // NOTE(review): load_unaligned_u8() comes from mem_neon.h (not shown
+    // here); presumably it packs two 4-byte rows, src_stride apart, into one
+    // 8-byte vector - confirm.
+    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    uint16x8_t blend = vmull_u8(s0, f0);
+    blend = vmlal_u8(blend, s1, f1);
+    // Rounding shift right by 3 and narrow back to 8 bits.
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    vst1_u8(dst_ptr, blend_u8);
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    i -= 2;
+  } while (i != 0);
+}
+
+// Applies a 2-tap bilinear filter to an 8-pixel-wide block, one row per
+// iteration:
+//   dst = (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
+// where s1 is read pixel_step bytes after s0. Output rows are written
+// back-to-back (8 bytes per row). dst_height must be at least 1 because the
+// loop body runs before the counter is tested.
+static inline void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+                                             uint8_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  // The two filter taps always sum to 8, matching the >> 3 normalisation.
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = vld1_u8(src_ptr);
+    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+    uint16x8_t blend = vmull_u8(s0, f0);
+    blend = vmlal_u8(blend, s1, f1);
+    // Rounding shift right by 3 and narrow back to 8 bits.
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    vst1_u8(dst_ptr, blend_u8);
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+  } while (--i != 0);
+}
+
+// Applies a 2-tap bilinear filter to a block of arbitrary width, 16 pixels at
+// a time:
+//   dst = (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
+// where s1 is read pixel_step bytes after s0. Each row is processed in whole
+// 16-byte vectors, so dst_width is expected to be a multiple of 16. The output
+// is written with a row stride of dst_width. dst_height must be at least 1.
+static inline void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+                                                uint8_t *dst_ptr,
+                                                int src_stride, int pixel_step,
+                                                int dst_width, int dst_height,
+                                                int filter_offset) {
+  // The two filter taps always sum to 8, matching the >> 3 normalisation.
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      // The widening multiply works on 8 lanes, so the low and high halves
+      // of each 16-byte vector are blended separately.
+      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+      uint8x16_t blend_u8 =
+          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+      vst1q_u8(dst_ptr + j, blend_u8);
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Fixed-width (16 pixels) wrapper around var_filter_block2d_bil_large().
+static inline void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+                                              uint8_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+                               dst_height, filter_offset);
+}
+
+// Fixed-width (32 pixels) wrapper around var_filter_block2d_bil_large().
+static inline void var_filter_block2d_bil_w32(const uint8_t *src_ptr,
+                                              uint8_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+                               dst_height, filter_offset);
+}
+
+// Fixed-width (64 pixels) wrapper around var_filter_block2d_bil_large().
+static inline void var_filter_block2d_bil_w64(const uint8_t *src_ptr,
+                                              uint8_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+                               dst_height, filter_offset);
+}
+
+// Fixed-width (128 pixels) wrapper around var_filter_block2d_bil_large().
+static inline void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int pixel_step, int dst_height,
+                                               int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
+                               dst_height, filter_offset);
+}
+
+// Writes the rounding average (s0 + s1 + 1) >> 1 of each source pixel and the
+// pixel pixel_step bytes after it. This gives the same result as the bilinear
+// filter with filter_offset == 4, without the widening multiplies. Rows are
+// processed in whole 16-byte vectors and written with a row stride of
+// dst_width. dst_height must be at least 1.
+//
+// NOTE(review): assert() is used below but this header does not include
+// <assert.h> itself; it appears to rely on another include providing it -
+// confirm, or include <assert.h> directly.
+static inline void var_filter_block2d_avg(const uint8_t *src_ptr,
+                                          uint8_t *dst_ptr, int src_stride,
+                                          int pixel_step, int dst_width,
+                                          int dst_height) {
+  // We only specialise on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      // Rounding halving add: (s0 + s1 + 1) >> 1 per byte.
+      uint8x16_t avg = vrhaddq_u8(s0, s1);
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+#endif // AOM_AOM_DSP_ARM_SUBPEL_VARIANCE_NEON_H_
diff --git a/aom_dsp/arm/subpel_variance_neon_dotprod.c b/aom_dsp/arm/subpel_variance_neon_dotprod.c
new file mode 100644
index 0000000000..6cb12d82c2
--- /dev/null
+++ b/aom_dsp/arm/subpel_variance_neon_dotprod.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/subpel_variance_neon.h"
+
+// Fused bilinear filter + variance accumulation for 8-pixel-wide blocks.
+// Each iteration filters two source rows (s1 is read pixel_step bytes after
+// s0), packs them into one 16-byte vector and compares it with two rows of
+// ref. On return *sum holds sum(filtered src) - sum(ref) and *sse holds the
+// sum of squared differences. Two rows are consumed per iteration and the
+// loop only stops when h reaches exactly zero, so h must be even and
+// non-zero.
+static inline void bil_variance_8xh_neon_dotprod(
+    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+    int pixel_step, int h, uint32_t *sse, int *sum, int filter_offset) {
+  // The two filter taps always sum to 8, matching the >> 3 normalisation.
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  do {
+    uint8x8_t s0_lo = vld1_u8(src);
+    uint8x8_t s1_lo = vld1_u8(src + pixel_step);
+    uint16x8_t blend_l = vmull_u8(s0_lo, f0);
+    blend_l = vmlal_u8(blend_l, s1_lo, f1);
+    uint8x8_t s0_hi = vld1_u8(src + src_stride);
+    uint8x8_t s1_hi = vld1_u8(src + src_stride + pixel_step);
+    uint16x8_t blend_h = vmull_u8(s0_hi, f0);
+    blend_h = vmlal_u8(blend_h, s1_hi, f1);
+    uint8x16_t s =
+        vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+    // NOTE(review): load_u8_8x2() comes from mem_neon.h (not shown here);
+    // presumably it packs two 8-byte rows, ref_stride apart, into one
+    // 16-byte vector - confirm.
+    uint8x16_t r = load_u8_8x2(ref, ref_stride);
+
+    // A dot product with an all-ones vector adds up each group of four
+    // bytes into a 32-bit lane, giving the pixel sums without widening.
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    // |s - r| dotted with itself accumulates the squared differences.
+    uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    h -= 2;
+  } while (h != 0);
+
+  // The difference of the two unsigned sums may be negative, so it is formed
+  // and reduced as signed 32-bit values.
+  int32x4_t sum_diff =
+      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+  *sum = vaddvq_s32(sum_diff);
+  *sse = vaddvq_u32(sse_u32);
+}
+
+// Fused bilinear filter + variance accumulation. Width 8 is forwarded to
+// bil_variance_8xh_neon_dotprod(); any other width is processed in whole
+// 16-byte vectors, one row at a time, so it is expected to be a multiple of
+// 16. Width 4 is not supported. s1 is read pixel_step bytes after s0. On
+// return *sum holds sum(filtered src) - sum(ref) and *sse holds the sum of
+// squared differences. h must be at least 1.
+static inline void bil_variance_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int pixel_step, int w, int h,
+                                             uint32_t *sse, int *sum,
+                                             int filter_offset) {
+  assert(w != 4);
+
+  if (w == 8) {
+    bil_variance_8xh_neon_dotprod(src, src_stride, ref, ref_stride, pixel_step,
+                                  h, sse, sum, filter_offset);
+    return;
+  }
+
+  // The two filter taps always sum to 8, matching the >> 3 normalisation.
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  do {
+    int i = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src + i);
+      uint8x16_t s1 = vld1q_u8(src + i + pixel_step);
+      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+      uint8x16_t s =
+          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+      uint8x16_t r = vld1q_u8(ref + i);
+
+      // A dot product with an all-ones vector adds up each group of four
+      // bytes into a 32-bit lane, giving the pixel sums without widening.
+      src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+      ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+      // |s - r| dotted with itself accumulates the squared differences.
+      uint8x16_t abs_diff = vabdq_u8(s, r);
+      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+      i += 16;
+    } while (i < w);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--h != 0);
+
+  // The difference of the two unsigned sums may be negative, so it is formed
+  // and reduced as signed 32-bit values.
+  int32x4_t sum_diff =
+      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+  *sum = vaddvq_s32(sum_diff);
+  *sse = vaddvq_u32(sse_u32);
+}
+
+// Fused averaging filter + variance accumulation. Each source pixel is
+// replaced by the rounding average (s0 + s1 + 1) >> 1 of itself and the pixel
+// pixel_step bytes after it, then compared with ref. w must be a multiple of
+// 16 and at least 16; h must be at least 1. On return *sum holds
+// sum(averaged src) - sum(ref) and *sse holds the sum of squared differences.
+static inline void avg_variance_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int pixel_step, int w, int h,
+                                             uint32_t *sse, int *sum) {
+  assert(w >= 16 && w % 16 == 0);
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  do {
+    int i = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src + i);
+      uint8x16_t s1 = vld1q_u8(src + i + pixel_step);
+      // Rounding halving add: (s0 + s1 + 1) >> 1 per byte.
+      uint8x16_t s = vrhaddq_u8(s0, s1);
+      uint8x16_t r = vld1q_u8(ref + i);
+
+      // A dot product with an all-ones vector adds up each group of four
+      // bytes into a 32-bit lane, giving the pixel sums without widening.
+      src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+      ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+      // |s - r| dotted with itself accumulates the squared differences.
+      uint8x16_t abs_diff = vabdq_u8(s, r);
+      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+      i += 16;
+    } while (i < w);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--h != 0);
+
+  // The difference of the two unsigned sums may be negative, so it is formed
+  // and reduced as signed 32-bit values.
+  int32x4_t sum_diff =
+      vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+  *sum = vaddvq_s32(sum_diff);
+  *sse = vaddvq_u32(sse_u32);
+}
+
+// Defines aom_sub_pixel_variance4x<h>_neon_dotprod(): sub-pixel variance for
+// 4-pixel-wide blocks. Both bilinear passes are written to temporary buffers
+// (horizontal into tmp0, vertical into tmp1) and the result is handed to the
+// Neon DotProd variance kernel, instead of fusing the vertical pass into the
+// variance computation as is done for the wider blocks.
+//
+// `padding` is the number of extra rows filtered horizontally so that the
+// vertical pass has a row below the block to read.
+// var_filter_block2d_bil_w4() produces two rows per iteration, so
+// h + padding has to be even.
+//
+// `sse` is declared as unsigned int * so that the signature matches the other
+// aom_sub_pixel_variance*_neon_dotprod() definitions generated in this file.
+#define SUBPEL_VARIANCE_4XH_NEON_DOTPROD(h, padding) \
+  unsigned int aom_sub_pixel_variance4x##h##_neon_dotprod( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+    uint8_t tmp0[4 * (h + padding)]; \
+    uint8_t tmp1[4 * h]; \
+    var_filter_block2d_bil_w4(src, tmp0, src_stride, 1, (h + padding), \
+                              xoffset); \
+    var_filter_block2d_bil_w4(tmp0, tmp1, 4, 4, h, yoffset); \
+    return aom_variance4x##h##_neon_dotprod(tmp1, 4, ref, ref_stride, sse); \
+  }
+
+// Defines aom_sub_pixel_variance<w>x<h>_neon_dotprod() without specialising
+// on the filter offsets. The horizontal bilinear pass is written to tmp
+// (h + padding rows, so the vertical pass has a row below the block to read),
+// and the vertical pass is fused into the variance accumulation. `shift` is
+// applied to sum * sum before it is subtracted from the SSE; the
+// instantiations in this file pass log2(w * h).
+#define SUBPEL_VARIANCE_WXH_NEON_DOTPROD(w, h, shift, padding) \
+  unsigned int aom_sub_pixel_variance##w##x##h##_neon_dotprod( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+    uint8_t tmp[w * (h + padding)]; \
+    int sum; \
+    var_filter_block2d_bil_w##w(src, tmp, src_stride, 1, h + padding, \
+                                xoffset); \
+    bil_variance_neon_dotprod(tmp, w, ref, ref_stride, w, w, h, sse, &sum, \
+                              yoffset); \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+  }
+
+// Defines aom_sub_pixel_variance<w>x<h>_neon_dotprod() with a dedicated path
+// for each combination of filter offsets:
+//   offset 0     - no filtering in that direction,
+//   offset 4     - rounding average of the two taps,
+//   other values - general 2-tap bilinear filter.
+// The last filtering pass is always fused into the variance accumulation.
+// When both directions need filtering, the horizontal pass is first written
+// to tmp (h + padding rows, so the vertical pass has a row below the block to
+// read). With both offsets zero the plain variance kernel is called directly.
+// `shift` is applied to sum * sum before it is subtracted from the SSE; the
+// instantiations in this file pass log2(w * h).
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(w, h, shift, padding) \
+  unsigned int aom_sub_pixel_variance##w##x##h##_neon_dotprod( \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+    int sum; \
+    if (xoffset == 0) { \
+      if (yoffset == 0) { \
+        return aom_variance##w##x##h##_neon_dotprod(src, src_stride, ref, \
+                                                    ref_stride, sse); \
+      } else if (yoffset == 4) { \
+        avg_variance_neon_dotprod(src, src_stride, ref, ref_stride, \
+                                  src_stride, w, h, sse, &sum); \
+      } else { \
+        bil_variance_neon_dotprod(src, src_stride, ref, ref_stride, \
+                                  src_stride, w, h, sse, &sum, yoffset); \
+      } \
+    } else if (xoffset == 4) { \
+      uint8_t tmp[w * (h + padding)]; \
+      if (yoffset == 0) { \
+        avg_variance_neon_dotprod(src, src_stride, ref, ref_stride, 1, w, h, \
+                                  sse, &sum); \
+      } else if (yoffset == 4) { \
+        var_filter_block2d_avg(src, tmp, src_stride, 1, w, h + padding); \
+        avg_variance_neon_dotprod(tmp, w, ref, ref_stride, w, w, h, sse, \
+                                  &sum); \
+      } else { \
+        var_filter_block2d_avg(src, tmp, src_stride, 1, w, h + padding); \
+        bil_variance_neon_dotprod(tmp, w, ref, ref_stride, w, w, h, sse, &sum, \
+                                  yoffset); \
+      } \
+    } else { \
+      uint8_t tmp[w * (h + padding)]; \
+      if (yoffset == 0) { \
+        bil_variance_neon_dotprod(src, src_stride, ref, ref_stride, 1, w, h, \
+                                  sse, &sum, xoffset); \
+      } else if (yoffset == 4) { \
+        var_filter_block2d_bil_w##w(src, tmp, src_stride, 1, h + padding, \
+                                    xoffset); \
+        avg_variance_neon_dotprod(tmp, w, ref, ref_stride, w, w, h, sse, \
+                                  &sum); \
+      } else { \
+        var_filter_block2d_bil_w##w(src, tmp, src_stride, 1, h + padding, \
+                                    xoffset); \
+        bil_variance_neon_dotprod(tmp, w, ref, ref_stride, w, w, h, sse, &sum, \
+                                  yoffset); \
+      } \
+    } \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+  }
+
+// Instantiate the kernels. The 4xh macro takes (h, padding); the others take
+// (w, h, shift, padding) with shift == log2(w * h). Blocks narrower than 16
+// and 16x8 / 16x4 use the non-specialised macro; all remaining sizes use the
+// variant that specialises on the filter offsets.
+SUBPEL_VARIANCE_4XH_NEON_DOTPROD(8, 2)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(8, 4, 5, 1)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(8, 8, 6, 1)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(8, 16, 7, 1)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(16, 8, 7, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(16, 16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(16, 32, 9, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(32, 16, 9, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(32, 32, 10, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(32, 64, 11, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(64, 32, 11, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(64, 64, 12, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(64, 128, 13, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(128, 64, 13, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(128, 128, 14, 1)
+
+// Block sizes that are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+SUBPEL_VARIANCE_4XH_NEON_DOTPROD(16, 2)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(8, 32, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON_DOTPROD(16, 4, 6, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(16, 64, 10, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(32, 8, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD(64, 16, 10, 1)
+#endif  // !CONFIG_REALTIME_ONLY
+
+// The generator macros are only needed for the instantiations above;
+// undefine every one of them, including the 4xh variant.
+#undef SUBPEL_VARIANCE_4XH_NEON_DOTPROD
+#undef SUBPEL_VARIANCE_WXH_NEON_DOTPROD
+#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON_DOTPROD
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 2a1bf4f165..ff5caeb35e 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -3674,6 +3674,35 @@ const VarianceParams kArrayVariance_neon_dotprod[] = {
INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, AvxVarianceTest,
::testing::ValuesIn(kArrayVariance_neon_dotprod));
+// Sub-pixel variance functions covered by the Neon DotProd test
+// instantiation. The first two arguments of each entry are log2 of the block
+// width and height (e.g. 7, 6 for 128x64). There is no 4x4 entry because no
+// Neon DotProd version of that block size is added.
+// NOTE(review): the meaning of the trailing 0 is not visible here; presumably
+// it is the bit depth argument of SubpelVarianceParams - confirm.
+const SubpelVarianceParams kArraySubpelVariance_neon_dotprod[] = {
+  SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_neon_dotprod, 0),
+  SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_neon_dotprod, 0),
+  SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_neon_dotprod, 0),
+  SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon_dotprod, 0),
+  SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_neon_dotprod, 0),
+  SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_neon_dotprod, 0),
+  SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon_dotprod, 0),
+  SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_neon_dotprod, 0),
+  SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_neon_dotprod, 0),
+  SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon_dotprod, 0),
+  SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_neon_dotprod, 0),
+  SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_neon_dotprod, 0),
+  SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon_dotprod, 0),
+  SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_neon_dotprod, 0),
+  SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_neon_dotprod, 0),
+// These block sizes are only built when CONFIG_REALTIME_ONLY is off.
+#if !CONFIG_REALTIME_ONLY
+  SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_neon_dotprod, 0),
+  SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_neon_dotprod, 0),
+  SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_neon_dotprod, 0),
+  SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_neon_dotprod, 0),
+  SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_neon_dotprod, 0),
+  SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_neon_dotprod, 0),
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, AvxSubpelVarianceTest,
+    ::testing::ValuesIn(kArraySubpelVariance_neon_dotprod));
+
const GetSseSumParams kArrayGetSseSum8x8Quad_neon_dotprod[] = {
GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),
GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_neon_dotprod, 0),