Commit 6319130bae for aom
commit 6319130bae5075f25a821f59df6cdb4e2c6a6b2c
Author: Li Zhang <li.zhang2@arm.com>
Date: Mon Feb 23 11:46:33 2026 +0100
Make direct calls to Neon variance paths in Neon subpel variance
Now that we have Neon DotProd paths for subpel variance as well as
variance, we no longer need to have an indirect call to the variance
function in Armv8.0 Neon subpel variance paths. Make a direct call to
the Neon variance path in these Neon subpel functions.
This is a port from SVT-AV1:
https://gitlab.com/AOMediaCodec/SVT-AV1/-/merge_requests/2608
Originally authored by: Jonathan Wright <Jonathan.Wright@arm.com>
Change-Id: I16e2541bc293708a342c84efe95b47b086725b11
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 2ba6d386b9..4e74737156 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -30,61 +30,62 @@
var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
xoffset); \
var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
}
-#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
- unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, unsigned int *sse) { \
- if (xoffset == 0) { \
- if (yoffset == 0) { \
- return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
- } else if (yoffset == 4) { \
- uint8_t tmp[w * h]; \
- var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
- return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
- } else { \
- uint8_t tmp[w * h]; \
- var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
- yoffset); \
- return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
- } \
- } else if (xoffset == 4) { \
- uint8_t tmp0[w * (h + padding)]; \
- if (yoffset == 0) { \
- var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
- return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
- } else if (yoffset == 4) { \
- uint8_t tmp1[w * (h + padding)]; \
- var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
- var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
- return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
- } else { \
- uint8_t tmp1[w * (h + padding)]; \
- var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
- var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
- } \
- } else { \
- uint8_t tmp0[w * (h + padding)]; \
- if (yoffset == 0) { \
- var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
- return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
- } else if (yoffset == 4) { \
- uint8_t tmp1[w * h]; \
- var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
- xoffset); \
- var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
- return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
- } else { \
- uint8_t tmp1[w * h]; \
- var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
- xoffset); \
- var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
- } \
- } \
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) /* Defines aom_sub_pixel_variance<w>x<h>_neon: offsets 0 and 4 are special-cased, every other offset goes through the bilinear filter; each path ends in a direct call to aom_variance<w>x<h>_neon. */ \
+ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ if (xoffset == 0) { /* only yoffset is consulted in this branch */ \
+ if (yoffset == 0) { /* both offsets 0: variance is taken directly on src, no temporary buffer */ \
+ return aom_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \
+ sse); \
+ } else if (yoffset == 4) { /* yoffset 4: var_filter_block2d_avg is used instead of the bilinear filter */ \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
+ return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { /* any other yoffset: single bilinear pass driven by yoffset */ \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
+ yoffset); \
+ return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { /* xoffset 4: the first pass is always var_filter_block2d_avg */ \
+ uint8_t tmp0[w * (h + padding)]; /* sized for the (h + padding) first-pass calls below */ \
+ if (yoffset == 0) { /* single pass over h; no second filter is applied */ \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { /* two averaging passes: src -> tmp0 -> tmp1 */ \
+ uint8_t tmp1[w * (h + padding)]; /* NOTE(review): the second pass is called with h; w * h (as in the final outer branch) may be enough - confirm */ \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { /* averaging first pass, then a bilinear pass driven by yoffset */ \
+ uint8_t tmp1[w * (h + padding)]; /* NOTE(review): the second pass is called with h; w * h may be enough - confirm */ \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { /* any other xoffset: the first pass is the bilinear filter driven by xoffset */ \
+ uint8_t tmp0[w * (h + padding)]; /* sized for the (h + padding) first-pass calls below */ \
+ if (yoffset == 0) { /* single bilinear pass over h; no second filter is applied */ \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { /* bilinear first pass, then var_filter_block2d_avg from tmp0 into tmp1 */ \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { /* general case: bilinear pass with xoffset, then bilinear pass with yoffset */ \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
 }
SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)