Commit e44e42dc3c for aom
commit e44e42dc3c93458fca1b1543c6088bc6b60475a1
Author: Jerome Jiang <jianj@google.com>
Date: Wed May 13 12:50:49 2026 -0400
highway: add 1-D vertical convolve for AVX2 and AVX512
Guard the legacy AVX2 vertical convolve code with !CONFIG_HIGHWAY.
| Size | CUR AVX2 | HWY AVX2 | HWY 512 |
|:--------|:--------|:---------|:--------|
| 4x4 | 5.691µs | 3.385µs | 3.348µs |
| 8x4 | 5.639µs | 3.592µs | 3.631µs |
| 4x8 | 6.953µs | 4.839µs | 4.656µs |
| 8x8 | 6.476µs | 4.631µs | 4.827µs |
| 16x8 | 7.163µs | 5.304µs | 5.386µs |
| 8x16 | 9.304µs | 7.429µs | 7.549µs |
| 16x16 | 10.93µs | 10.16µs | 10.27µs |
| 32x16 | 19.80µs | 22.07µs | 13.56µs |
| 16x32 | 18.32µs | 19.06µs | 19.10µs |
| 32x32 | 34.64µs | 40.20µs | 25.96µs |
| 64x32 | 69.56µs | 79.88µs | 51.01µs |
| 32x64 | 175.2µs | 171.3µs | 86.84µs |
| 64x64 | 313.2µs | 323.0µs | 239.5µs |
| 128x64 | 534.1µs | 495.3µs | 511.8µs |
| 64x128 | 680.2µs | 662.5µs | 414.4µs |
| 128x128 | 1.357ms | 1.326ms | 803.3µs |
Change-Id: I9a6030336377c105658b232380153e0bfaaa33dd
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index b80ee54a97..a09a4f23f8 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -102,8 +102,11 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
"${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h")
if(CONFIG_HIGHWAY)
+ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/convolve_vert_hwy_avx2.cc")
list(APPEND AOM_DSP_COMMON_INTRIN_AVX512
- "${AOM_ROOT}/aom_dsp/x86/convolve_hwy_avx512.cc")
+ "${AOM_ROOT}/aom_dsp/x86/convolve_hwy_avx512.cc"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_vert_hwy_avx512.cc")
endif()
list(APPEND AOM_DSP_COMMON_INTRIN_NEON
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index a3b3564704..e5fb5dcb66 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -530,6 +530,11 @@ specialize qw/aom_convolve_copy neon sse2 avx2/;
specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3";
specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3";
+if (aom_config("CONFIG_HIGHWAY") eq "yes") {
+ specialize qw/aom_convolve8_horiz avx512/;
+ specialize qw/aom_convolve8_vert avx512/;
+}
+
add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/aom_scaled_2d ssse3 neon neon_dotprod neon_i8mm/;
diff --git a/aom_dsp/convolve_hwy.h b/aom_dsp/convolve_hwy.h
index 58e18b0f72..e5be37ec9f 100644
--- a/aom_dsp/convolve_hwy.h
+++ b/aom_dsp/convolve_hwy.h
@@ -27,14 +27,19 @@ namespace hn = hwy::HWY_NAMESPACE;
template <typename D>
HWY_ATTR HWY_INLINE hn::VFromD<D> LoadUnaligned4x4(D tag16, const uint8_t *buf,
ptrdiff_t stride) {
- HWY_ALIGN int16_t buf_to_array[16];
- for (int i = 0; i < 4; ++i) {
- for (int j = 0; j < 4; ++j) {
- buf_to_array[i * 4 + j] = buf[j];
- }
- buf += stride;
- }
- return hn::Load(tag16, buf_to_array);
+ hn::CappedTag<uint32_t, 4> tag32;
+ uint32_t r0, r1, r2, r3;
+ memcpy(&r0, buf, 4);
+ memcpy(&r1, buf + stride, 4);
+ memcpy(&r2, buf + 2 * stride, 4);
+ memcpy(&r3, buf + 3 * stride, 4);
+ auto v32 = hn::Zero(tag32);
+ v32 = hn::InsertLane(v32, 0, r0);
+ v32 = hn::InsertLane(v32, 1, r1);
+ v32 = hn::InsertLane(v32, 2, r2);
+ v32 = hn::InsertLane(v32, 3, r3);
+ hn::Rebind<uint8_t, D> tag8;
+ return hn::PromoteTo(tag16, hn::BitCast(tag8, v32));
}
template <typename D>
@@ -43,7 +48,9 @@ HWY_ATTR HWY_INLINE void StoreUnaligned4x4(D tag16, uint8_t *buf,
hn::VFromD<D> &vec) {
(void)tag16;
hn::Rebind<uint8_t, D> tag8;
- auto vec_demoted = hn::DemoteTo(tag8, vec);
+ constexpr hn::Half<D> half_tag16;
+ auto vec_demoted = hn::ReorderDemote2To(tag8, hn::LowerHalf(half_tag16, vec),
+ hn::UpperHalf(half_tag16, vec));
constexpr hn::Half<decltype(tag8)> half_tag;
constexpr hn::Half<decltype(half_tag)> quarter_tag;
auto vec1_2 = hn::LowerHalf(half_tag, vec_demoted);
@@ -74,7 +81,9 @@ HWY_ATTR HWY_INLINE void StoreUnaligned2x8(D tag, uint8_t *buf,
hn::VFromD<D> &vec) {
(void)tag;
hn::Rebind<uint8_t, D> tag8;
- auto vec_demoted = hn::DemoteTo(tag8, vec);
+ constexpr hn::Half<D> half_tag16;
+ auto vec_demoted = hn::ReorderDemote2To(tag8, hn::LowerHalf(half_tag16, vec),
+ hn::UpperHalf(half_tag16, vec));
constexpr hn::Half<decltype(tag8)> half_tag8;
auto vec1_2 = hn::UpperHalf(half_tag8, vec_demoted);
auto vec2_2 = hn::LowerHalf(half_tag8, vec_demoted);
@@ -105,7 +114,9 @@ HWY_ATTR HWY_INLINE void StoreUnaligned4x8(D tag, uint8_t *buf,
hn::VFromD<D> &vec) {
(void)tag;
hn::Rebind<uint8_t, D> tag8;
- auto vec_demoted = hn::DemoteTo(tag8, vec);
+ constexpr hn::Half<D> half_tag16;
+ auto vec_demoted = hn::ReorderDemote2To(tag8, hn::LowerHalf(half_tag16, vec),
+ hn::UpperHalf(half_tag16, vec));
constexpr hn::Half<decltype(tag8)> half_tag8;
constexpr hn::Half<decltype(half_tag8)> quarter_tag8;
auto vec1_2 = hn::UpperHalf(half_tag8, vec_demoted);
@@ -379,6 +390,371 @@ HWY_ATTR inline void ConvolveHoriz8Tap(const uint8_t *src, ptrdiff_t src_stride,
}
}
+HWY_ATTR inline void ConvolveVert2Tap(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_y, int w, int h) {
+ hn::CappedTag<int16_t, 16> tag16;
+ hn::Rebind<uint8_t, decltype(tag16)> pixel_tag;
+
+ auto f0 = hn::Set(tag16, filter_y[3]);
+ auto f1 = hn::Set(tag16, filter_y[4]);
+ auto round_offset = hn::Set(tag16, 1 << (FILTER_BITS - 1));
+
+ if (w == 4) {
+ for (int y = 0; y < h; y += 4) {
+ auto s0 = LoadUnaligned4x4(tag16, src + y * src_stride, src_stride);
+ auto s1 = LoadUnaligned4x4(tag16, src + (y + 1) * src_stride, src_stride);
+ auto res = hn::ShiftRight<FILTER_BITS>(s0 * f0 + s1 * f1 + round_offset);
+ StoreUnaligned4x4(tag16, dst + y * dst_stride, dst_stride, res);
+ }
+ } else if (w == 8) {
+ for (int y = 0; y < h; y += 4) {
+ auto s0 = LoadUnaligned2x8(tag16, src + y * src_stride, src_stride);
+ auto s1 = LoadUnaligned2x8(tag16, src + (y + 1) * src_stride, src_stride);
+ auto s2 = LoadUnaligned2x8(tag16, src + (y + 2) * src_stride, src_stride);
+ auto s3 = LoadUnaligned2x8(tag16, src + (y + 3) * src_stride, src_stride);
+ auto res0 = hn::ShiftRight<FILTER_BITS>(s0 * f0 + s1 * f1 + round_offset);
+ auto res1 = hn::ShiftRight<FILTER_BITS>(s2 * f0 + s3 * f1 + round_offset);
+ StoreUnaligned2x8(tag16, dst + y * dst_stride, dst_stride, res0);
+ StoreUnaligned2x8(tag16, dst + (y + 2) * dst_stride, dst_stride, res1);
+ }
+ } else if (w == 16) {
+ constexpr hn::Half<decltype(tag16)> half_tag16;
+ auto s0 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src));
+ for (int y = 0; y < h; y += 4) {
+ auto s1 = hn::PromoteTo(tag16,
+ hn::LoadU(pixel_tag, src + (y + 1) * src_stride));
+ auto s2 = hn::PromoteTo(tag16,
+ hn::LoadU(pixel_tag, src + (y + 2) * src_stride));
+ auto s3 = hn::PromoteTo(tag16,
+ hn::LoadU(pixel_tag, src + (y + 3) * src_stride));
+ auto s4 = hn::PromoteTo(tag16,
+ hn::LoadU(pixel_tag, src + (y + 4) * src_stride));
+
+ auto res0 = hn::ShiftRight<FILTER_BITS>(s0 * f0 + s1 * f1 + round_offset);
+ auto res1 = hn::ShiftRight<FILTER_BITS>(s1 * f0 + s2 * f1 + round_offset);
+ auto res2 = hn::ShiftRight<FILTER_BITS>(s2 * f0 + s3 * f1 + round_offset);
+ auto res3 = hn::ShiftRight<FILTER_BITS>(s3 * f0 + s4 * f1 + round_offset);
+
+ hn::StoreU(
+ hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res0),
+ hn::UpperHalf(half_tag16, res0)),
+ pixel_tag, dst + y * dst_stride);
+ hn::StoreU(
+ hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res1),
+ hn::UpperHalf(half_tag16, res1)),
+ pixel_tag, dst + (y + 1) * dst_stride);
+ hn::StoreU(
+ hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res2),
+ hn::UpperHalf(half_tag16, res2)),
+ pixel_tag, dst + (y + 2) * dst_stride);
+ hn::StoreU(
+ hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res3),
+ hn::UpperHalf(half_tag16, res3)),
+ pixel_tag, dst + (y + 3) * dst_stride);
+
+ s0 = s4;
+ }
+ } else {
+ hn::ScalableTag<int16_t> mul_tag;
+ hn::Rebind<uint8_t, decltype(mul_tag)> p_tag;
+ auto f0_s = hn::Set(mul_tag, filter_y[3]);
+ auto f1_s = hn::Set(mul_tag, filter_y[4]);
+ auto round_offset_s = hn::Set(mul_tag, 1 << (FILTER_BITS - 1));
+ auto vw = hn::Lanes(mul_tag);
+ for (int x = 0; x < w; x += vw) {
+ auto s0 = hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x));
+ for (int y = 0; y < h; ++y) {
+ auto s1 = hn::PromoteTo(
+ mul_tag, hn::LoadU(p_tag, src + x + (y + 1) * src_stride));
+ auto res =
+ hn::ShiftRight<FILTER_BITS>(s0 * f0_s + s1 * f1_s + round_offset_s);
+ auto res_demoted = hn::DemoteTo(p_tag, res);
+ if (x + static_cast<int>(vw) > w) {
+ hn::StoreN(res_demoted, p_tag, dst + x + y * dst_stride, w - x);
+ } else {
+ hn::StoreU(res_demoted, p_tag, dst + x + y * dst_stride);
+ }
+ s0 = s1;
+ }
+ }
+ }
+}
+
+HWY_ATTR inline void ConvolveVert4Tap(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_y, int w, int h) {
+ hn::CappedTag<int16_t, 16> tag16;
+ hn::Rebind<uint8_t, decltype(tag16)> pixel_tag;
+
+ auto f0 = hn::Set(tag16, filter_y[2] >> 1);
+ auto f1 = hn::Set(tag16, filter_y[3] >> 1);
+ auto f2 = hn::Set(tag16, filter_y[4] >> 1);
+ auto f3 = hn::Set(tag16, filter_y[5] >> 1);
+ auto round_offset = hn::Set(tag16, 1 << (FILTER_BITS - 2));
+
+ if (w == 4) {
+ for (int y = 0; y < h; y += 4) {
+ auto s0 = LoadUnaligned4x4(tag16, src + y * src_stride, src_stride);
+ auto s1 = LoadUnaligned4x4(tag16, src + (y + 1) * src_stride, src_stride);
+ auto s2 = LoadUnaligned4x4(tag16, src + (y + 2) * src_stride, src_stride);
+ auto s3 = LoadUnaligned4x4(tag16, src + (y + 3) * src_stride, src_stride);
+ auto res = hn::ShiftRight<FILTER_BITS - 1>(s0 * f0 + s1 * f1 + s2 * f2 +
+ s3 * f3 + round_offset);
+ StoreUnaligned4x4(tag16, dst + y * dst_stride, dst_stride, res);
+ }
+ } else if (w == 8) {
+ auto s0 = LoadUnaligned2x8(tag16, src + 0 * src_stride, src_stride);
+ auto s1 = LoadUnaligned2x8(tag16, src + 1 * src_stride, src_stride);
+ for (int y = 0; y < h; y += 4) {
+ auto s2 = LoadUnaligned2x8(tag16, src + (y + 2) * src_stride, src_stride);
+ auto s3 = LoadUnaligned2x8(tag16, src + (y + 3) * src_stride, src_stride);
+ auto s4 = LoadUnaligned2x8(tag16, src + (y + 4) * src_stride, src_stride);
+ auto s5 = LoadUnaligned2x8(tag16, src + (y + 5) * src_stride, src_stride);
+ auto res0 = hn::ShiftRight<FILTER_BITS - 1>(s0 * f0 + s1 * f1 + s2 * f2 +
+ s3 * f3 + round_offset);
+ auto res1 = hn::ShiftRight<FILTER_BITS - 1>(s2 * f0 + s3 * f1 + s4 * f2 +
+ s5 * f3 + round_offset);
+ StoreUnaligned2x8(tag16, dst + y * dst_stride, dst_stride, res0);
+ StoreUnaligned2x8(tag16, dst + (y + 2) * dst_stride, dst_stride, res1);
+ s0 = s4;
+ s1 = s5;
+ }
+ } else if (w == 16) {
+ constexpr hn::Half<decltype(tag16)> half_tag16;
+ auto s0 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 0 * src_stride));
+ auto s1 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 1 * src_stride));
+ auto s2 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 2 * src_stride));
+ for (int y = 0; y < h; y += 4) {
+ auto s3 = hn::PromoteTo(tag16,
+ hn::LoadU(pixel_tag, src + (y + 3) * src_stride));
+ auto s4 = hn::PromoteTo(tag16,
+ hn::LoadU(pixel_tag, src + (y + 4) * src_stride));
+ auto s5 = hn::PromoteTo(tag16,
+ hn::LoadU(pixel_tag, src + (y + 5) * src_stride));
+ auto s6 = hn::PromoteTo(tag16,
+ hn::LoadU(pixel_tag, src + (y + 6) * src_stride));
+
+ auto res0 = hn::ShiftRight<FILTER_BITS - 1>(s0 * f0 + s1 * f1 + s2 * f2 +
+ s3 * f3 + round_offset);
+ auto res1 = hn::ShiftRight<FILTER_BITS - 1>(s1 * f0 + s2 * f1 + s3 * f2 +
+ s4 * f3 + round_offset);
+ auto res2 = hn::ShiftRight<FILTER_BITS - 1>(s2 * f0 + s3 * f1 + s4 * f2 +
+ s5 * f3 + round_offset);
+ auto res3 = hn::ShiftRight<FILTER_BITS - 1>(s3 * f0 + s4 * f1 + s5 * f2 +
+ s6 * f3 + round_offset);
+
+ hn::StoreU(
+ hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res0),
+ hn::UpperHalf(half_tag16, res0)),
+ pixel_tag, dst + y * dst_stride);
+ hn::StoreU(
+ hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res1),
+ hn::UpperHalf(half_tag16, res1)),
+ pixel_tag, dst + (y + 1) * dst_stride);
+ hn::StoreU(
+ hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res2),
+ hn::UpperHalf(half_tag16, res2)),
+ pixel_tag, dst + (y + 2) * dst_stride);
+ hn::StoreU(
+ hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res3),
+ hn::UpperHalf(half_tag16, res3)),
+ pixel_tag, dst + (y + 3) * dst_stride);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ }
+ } else {
+ hn::ScalableTag<int16_t> mul_tag;
+ hn::Rebind<uint8_t, decltype(mul_tag)> p_tag;
+ auto f0_s = hn::Set(mul_tag, filter_y[2] >> 1);
+ auto f1_s = hn::Set(mul_tag, filter_y[3] >> 1);
+ auto f2_s = hn::Set(mul_tag, filter_y[4] >> 1);
+ auto f3_s = hn::Set(mul_tag, filter_y[5] >> 1);
+ auto round_offset_s = hn::Set(mul_tag, 1 << (FILTER_BITS - 2));
+ auto vw = hn::Lanes(mul_tag);
+ for (int x = 0; x < w; x += vw) {
+ auto s0 = hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x));
+ auto s1 =
+ hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 1 * src_stride));
+ auto s2 =
+ hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 2 * src_stride));
+ for (int y = 0; y < h; ++y) {
+ auto s3 = hn::PromoteTo(
+ mul_tag, hn::LoadU(p_tag, src + x + (y + 3) * src_stride));
+ auto res = hn::ShiftRight<FILTER_BITS - 1>(
+ s0 * f0_s + s1 * f1_s + s2 * f2_s + s3 * f3_s + round_offset_s);
+ auto res_demoted = hn::DemoteTo(p_tag, res);
+ if (x + static_cast<int>(vw) > w) {
+ hn::StoreN(res_demoted, p_tag, dst + x + y * dst_stride, w - x);
+ } else {
+ hn::StoreU(res_demoted, p_tag, dst + x + y * dst_stride);
+ }
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ }
+ }
+ }
+}
+
+HWY_ATTR inline void ConvolveVert8Tap(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_y, int w, int h) {
+ hn::CappedTag<int16_t, 16> tag16;
+ hn::Rebind<uint8_t, decltype(tag16)> pixel_tag;
+
+ auto f0 = hn::Set(tag16, filter_y[0] >> 1);
+ auto f1 = hn::Set(tag16, filter_y[1] >> 1);
+ auto f2 = hn::Set(tag16, filter_y[2] >> 1);
+ auto f3 = hn::Set(tag16, filter_y[3] >> 1);
+ auto f4 = hn::Set(tag16, filter_y[4] >> 1);
+ auto f5 = hn::Set(tag16, filter_y[5] >> 1);
+ auto f6 = hn::Set(tag16, filter_y[6] >> 1);
+ auto f7 = hn::Set(tag16, filter_y[7] >> 1);
+ auto round_offset = hn::Set(tag16, 1 << (FILTER_BITS - 2));
+
+ if (w == 4) {
+ for (int y = 0; y < h; y += 4) {
+ auto s0 = LoadUnaligned4x4(tag16, src + (y + 0) * src_stride, src_stride);
+ auto s1 = LoadUnaligned4x4(tag16, src + (y + 1) * src_stride, src_stride);
+ auto s2 = LoadUnaligned4x4(tag16, src + (y + 2) * src_stride, src_stride);
+ auto s3 = LoadUnaligned4x4(tag16, src + (y + 3) * src_stride, src_stride);
+ auto s4 = LoadUnaligned4x4(tag16, src + (y + 4) * src_stride, src_stride);
+ auto s5 = LoadUnaligned4x4(tag16, src + (y + 5) * src_stride, src_stride);
+ auto s6 = LoadUnaligned4x4(tag16, src + (y + 6) * src_stride, src_stride);
+ auto s7 = LoadUnaligned4x4(tag16, src + (y + 7) * src_stride, src_stride);
+ auto res = hn::ShiftRight<FILTER_BITS - 1>(
+ s0 * f0 + s1 * f1 + s2 * f2 + s3 * f3 + s4 * f4 + s5 * f5 + s6 * f6 +
+ s7 * f7 + round_offset);
+ StoreUnaligned4x4(tag16, dst + y * dst_stride, dst_stride, res);
+ }
+ } else if (w == 8) {
+ auto s0 = LoadUnaligned2x8(tag16, src + 0 * src_stride, src_stride);
+ auto s1 = LoadUnaligned2x8(tag16, src + 1 * src_stride, src_stride);
+ auto s2 = LoadUnaligned2x8(tag16, src + 2 * src_stride, src_stride);
+ auto s3 = LoadUnaligned2x8(tag16, src + 3 * src_stride, src_stride);
+ auto s4 = LoadUnaligned2x8(tag16, src + 4 * src_stride, src_stride);
+ auto s5 = LoadUnaligned2x8(tag16, src + 5 * src_stride, src_stride);
+ for (int y = 0; y < h; y += 2) {
+ auto s6 = LoadUnaligned2x8(tag16, src + (y + 6) * src_stride, src_stride);
+ auto s7 = LoadUnaligned2x8(tag16, src + (y + 7) * src_stride, src_stride);
+ auto res = hn::ShiftRight<FILTER_BITS - 1>(
+ s0 * f0 + s1 * f1 + s2 * f2 + s3 * f3 + s4 * f4 + s5 * f5 + s6 * f6 +
+ s7 * f7 + round_offset);
+ StoreUnaligned2x8(tag16, dst + y * dst_stride, dst_stride, res);
+ s0 = s2;
+ s1 = s3;
+ s2 = s4;
+ s3 = s5;
+ s4 = s6;
+ s5 = s7;
+ }
+ } else if (w == 16) {
+ constexpr hn::Half<decltype(tag16)> half_tag16;
+ auto s0 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 0 * src_stride));
+ auto s1 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 1 * src_stride));
+ auto s2 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 2 * src_stride));
+ auto s3 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 3 * src_stride));
+ auto s4 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 4 * src_stride));
+ auto s5 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 5 * src_stride));
+ auto s6 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 6 * src_stride));
+ for (int y = 0; y < h; ++y) {
+ auto s7 = hn::PromoteTo(tag16,
+ hn::LoadU(pixel_tag, src + (y + 7) * src_stride));
+ auto res = hn::ShiftRight<FILTER_BITS - 1>(
+ s0 * f0 + s1 * f1 + s2 * f2 + s3 * f3 + s4 * f4 + s5 * f5 + s6 * f6 +
+ s7 * f7 + round_offset);
+ hn::StoreU(hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res),
+ hn::UpperHalf(half_tag16, res)),
+ pixel_tag, dst + y * dst_stride);
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ }
+ } else {
+ hn::ScalableTag<int16_t> mul_tag;
+ hn::Rebind<uint8_t, decltype(mul_tag)> p_tag;
+ auto f0_s = hn::Set(mul_tag, filter_y[0] >> 1);
+ auto f1_s = hn::Set(mul_tag, filter_y[1] >> 1);
+ auto f2_s = hn::Set(mul_tag, filter_y[2] >> 1);
+ auto f3_s = hn::Set(mul_tag, filter_y[3] >> 1);
+ auto f4_s = hn::Set(mul_tag, filter_y[4] >> 1);
+ auto f5_s = hn::Set(mul_tag, filter_y[5] >> 1);
+ auto f6_s = hn::Set(mul_tag, filter_y[6] >> 1);
+ auto f7_s = hn::Set(mul_tag, filter_y[7] >> 1);
+ auto round_offset_s = hn::Set(mul_tag, 1 << (FILTER_BITS - 2));
+ auto vw = hn::Lanes(mul_tag);
+ for (int x = 0; x < w; x += vw) {
+ auto s0 =
+ hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 0 * src_stride));
+ auto s1 =
+ hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 1 * src_stride));
+ auto s2 =
+ hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 2 * src_stride));
+ auto s3 =
+ hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 3 * src_stride));
+ auto s4 =
+ hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 4 * src_stride));
+ auto s5 =
+ hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 5 * src_stride));
+ auto s6 =
+ hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 6 * src_stride));
+ for (int y = 0; y < h; ++y) {
+ auto s7 = hn::PromoteTo(
+ mul_tag, hn::LoadU(p_tag, src + x + (y + 7) * src_stride));
+ auto sum = s0 * f0_s + s1 * f1_s + s2 * f2_s + s3 * f3_s + s4 * f4_s +
+ s5 * f5_s + s6 * f6_s + s7 * f7_s;
+ auto res = hn::ShiftRight<FILTER_BITS - 1>(sum + round_offset_s);
+ auto res_demoted = hn::DemoteTo(p_tag, res);
+ if (x + static_cast<int>(vw) > w) {
+ hn::StoreN(res_demoted, p_tag, dst + x + y * dst_stride, w - x);
+ } else {
+ hn::StoreU(res_demoted, p_tag, dst + x + y * dst_stride);
+ }
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ }
+ }
+ }
+}
+
+HWY_MAYBE_UNUSED void Convolve8Vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_x;
+ (void)y_step_q4;
+
+ src -= src_stride * ((SUBPEL_TAPS / 2) - 1);
+ int filter_taps = get_filter_taps_convolve8(filter_y);
+ if (filter_taps == 2) {
+ ConvolveVert2Tap(src + src_stride * 3, src_stride, dst, dst_stride,
+ filter_y, w, h);
+ } else if (filter_taps == 4) {
+ ConvolveVert4Tap(src + src_stride * 2, src_stride, dst, dst_stride,
+ filter_y, w, h);
+ } else {
+ // filter_taps = 8
+ ConvolveVert8Tap(src, src_stride, dst, dst_stride, filter_y, w, h);
+ }
+}
+
HWY_MAYBE_UNUSED void Convolve8Horiz(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index e8e94a42c9..c46d146138 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -49,12 +49,6 @@ static inline void xx_storeu2_epi32(const uint8_t *output_ptr,
_mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
}
-static inline __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
- __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
- a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
- return a;
-}
-
static inline void xx_storeu2_epi64(const uint8_t *output_ptr,
const ptrdiff_t stride, const __m256i *a) {
_mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
@@ -790,6 +784,14 @@ static void aom_filter_block1d16_h8_avx2(
}
}
+#if !CONFIG_HIGHWAY
+
+static inline __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
+ return a;
+}
+
static void aom_filter_block1d8_v4_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -1400,33 +1402,39 @@ static void aom_filter_block1d4_v4_avx2(
srcReg4x = srcReg6x;
}
}
+#endif // !CONFIG_HIGHWAY
#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
-#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
-#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
-#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
+
// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+
+#if !CONFIG_HIGHWAY
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
+#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
+#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
+#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
+
// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
+#endif // !CONFIG_HIGHWAY
#endif // HAVE_AX2 && HAVE_SSSE3
diff --git a/aom_dsp/x86/convolve_vert_hwy_avx2.cc b/aom_dsp/x86/convolve_vert_hwy_avx2.cc
new file mode 100644
index 0000000000..e9288331d4
--- /dev/null
+++ b/aom_dsp/x86/convolve_vert_hwy_avx2.cc
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define HWY_BASELINE_TARGETS HWY_AVX2
+#define HWY_BROKEN_32BIT 0
+
+#include "aom_dsp/convolve_hwy.h"
+
+extern "C" void aom_convolve8_vert_avx2(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+HWY_ATTR void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ HWY_NAMESPACE::Convolve8Vert(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+}
diff --git a/aom_dsp/x86/convolve_vert_hwy_avx512.cc b/aom_dsp/x86/convolve_vert_hwy_avx512.cc
new file mode 100644
index 0000000000..0c6105eb8a
--- /dev/null
+++ b/aom_dsp/x86/convolve_vert_hwy_avx512.cc
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define HWY_BASELINE_TARGETS HWY_AVX3_DL
+#define HWY_BROKEN_32BIT 0
+
+#include "aom_dsp/convolve_hwy.h"
+
+extern "C" void aom_convolve8_vert_avx512(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h);
+
+HWY_ATTR void aom_convolve8_vert_avx512(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ HWY_NAMESPACE::Convolve8Vert(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+}
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 23db6d70ca..2d1e12cd09 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -884,8 +884,13 @@ extern "C" void aom_convolve8_horiz_avx512(
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w, int h);
+extern "C" void aom_convolve8_vert_avx512(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h);
+
const ConvolveFunctions convolve8_avx512(aom_convolve8_horiz_avx512,
- aom_convolve8_vert_c, 0);
+ aom_convolve8_vert_avx512, 0);
const ConvolveParam kArray_Convolve8_avx512[] = { ALL_SIZES(convolve8_avx512) };
INSTANTIATE_TEST_SUITE_P(AVX512, LowbdConvolveTest,