Commit e44e42dc3c for aom

commit e44e42dc3c93458fca1b1543c6088bc6b60475a1
Author: Jerome Jiang <jianj@google.com>
Date:   Wed May 13 12:50:49 2026 -0400

    highway: convolve 1d vert for AVX2 and AVX512

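    Add Highway implementations of aom_convolve8_vert for AVX2 and AVX512,
    and guard the legacy AVX2 vertical-filter code with !CONFIG_HIGHWAY.

    Timings by block size (lower is better):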

        | Size    | CUR AVX2 | HWY AVX2 | HWY 512 |
        |:--------|:--------|:---------|:--------|
        | 4x4     | 5.691µs | 3.385µs  | 3.348µs |
        | 8x4     | 5.639µs | 3.592µs  | 3.631µs |
        | 4x8     | 6.953µs | 4.839µs  | 4.656µs |
        | 8x8     | 6.476µs | 4.631µs  | 4.827µs |
        | 16x8    | 7.163µs | 5.304µs  | 5.386µs |
        | 8x16    | 9.304µs | 7.429µs  | 7.549µs |
        | 16x16   | 10.93µs | 10.16µs  | 10.27µs |
        | 32x16   | 19.80µs | 22.07µs  | 13.56µs |
        | 16x32   | 18.32µs | 19.06µs  | 19.10µs |
        | 32x32   | 34.64µs | 40.20µs  | 25.96µs |
        | 64x32   | 69.56µs | 79.88µs  | 51.01µs |
        | 32x64   | 175.2µs | 171.3µs  | 86.84µs |
        | 64x64   | 313.2µs | 323.0µs  | 239.5µs |
        | 128x64  | 534.1µs | 495.3µs  | 511.8µs |
        | 64x128  | 680.2µs | 662.5µs  | 414.4µs |
        | 128x128 | 1.357ms | 1.326ms  | 803.3µs |

    Change-Id: I9a6030336377c105658b232380153e0bfaaa33dd

diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index b80ee54a97..a09a4f23f8 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -102,8 +102,11 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
             "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h")

 if(CONFIG_HIGHWAY)
+  list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+              "${AOM_ROOT}/aom_dsp/x86/convolve_vert_hwy_avx2.cc")
   list(APPEND AOM_DSP_COMMON_INTRIN_AVX512
-              "${AOM_ROOT}/aom_dsp/x86/convolve_hwy_avx512.cc")
+              "${AOM_ROOT}/aom_dsp/x86/convolve_hwy_avx512.cc"
+              "${AOM_ROOT}/aom_dsp/x86/convolve_vert_hwy_avx512.cc")
 endif()

 list(APPEND AOM_DSP_COMMON_INTRIN_NEON
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index a3b3564704..e5fb5dcb66 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -530,6 +530,11 @@ specialize qw/aom_convolve_copy       neon                        sse2 avx2/;
 specialize qw/aom_convolve8_horiz     neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3";
 specialize qw/aom_convolve8_vert      neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3";

+if (aom_config("CONFIG_HIGHWAY") eq "yes") {
+  specialize qw/aom_convolve8_horiz     avx512/;
+  specialize qw/aom_convolve8_vert      avx512/;
+}
+
 add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/aom_scaled_2d ssse3 neon neon_dotprod neon_i8mm/;

diff --git a/aom_dsp/convolve_hwy.h b/aom_dsp/convolve_hwy.h
index 58e18b0f72..e5be37ec9f 100644
--- a/aom_dsp/convolve_hwy.h
+++ b/aom_dsp/convolve_hwy.h
@@ -27,14 +27,19 @@ namespace hn = hwy::HWY_NAMESPACE;
 template <typename D>
 HWY_ATTR HWY_INLINE hn::VFromD<D> LoadUnaligned4x4(D tag16, const uint8_t *buf,
                                                    ptrdiff_t stride) {
-  HWY_ALIGN int16_t buf_to_array[16];
-  for (int i = 0; i < 4; ++i) {
-    for (int j = 0; j < 4; ++j) {
-      buf_to_array[i * 4 + j] = buf[j];
-    }
-    buf += stride;
-  }
-  return hn::Load(tag16, buf_to_array);
+  hn::CappedTag<uint32_t, 4> tag32;
+  uint32_t r0, r1, r2, r3;
+  memcpy(&r0, buf, 4);
+  memcpy(&r1, buf + stride, 4);
+  memcpy(&r2, buf + 2 * stride, 4);
+  memcpy(&r3, buf + 3 * stride, 4);
+  auto v32 = hn::Zero(tag32);
+  v32 = hn::InsertLane(v32, 0, r0);
+  v32 = hn::InsertLane(v32, 1, r1);
+  v32 = hn::InsertLane(v32, 2, r2);
+  v32 = hn::InsertLane(v32, 3, r3);
+  hn::Rebind<uint8_t, D> tag8;
+  return hn::PromoteTo(tag16, hn::BitCast(tag8, v32));
 }

 template <typename D>
@@ -43,7 +48,9 @@ HWY_ATTR HWY_INLINE void StoreUnaligned4x4(D tag16, uint8_t *buf,
                                            hn::VFromD<D> &vec) {
   (void)tag16;
   hn::Rebind<uint8_t, D> tag8;
-  auto vec_demoted = hn::DemoteTo(tag8, vec);
+  constexpr hn::Half<D> half_tag16;
+  auto vec_demoted = hn::ReorderDemote2To(tag8, hn::LowerHalf(half_tag16, vec),
+                                          hn::UpperHalf(half_tag16, vec));
   constexpr hn::Half<decltype(tag8)> half_tag;
   constexpr hn::Half<decltype(half_tag)> quarter_tag;
   auto vec1_2 = hn::LowerHalf(half_tag, vec_demoted);
@@ -74,7 +81,9 @@ HWY_ATTR HWY_INLINE void StoreUnaligned2x8(D tag, uint8_t *buf,
                                            hn::VFromD<D> &vec) {
   (void)tag;
   hn::Rebind<uint8_t, D> tag8;
-  auto vec_demoted = hn::DemoteTo(tag8, vec);
+  constexpr hn::Half<D> half_tag16;
+  auto vec_demoted = hn::ReorderDemote2To(tag8, hn::LowerHalf(half_tag16, vec),
+                                          hn::UpperHalf(half_tag16, vec));
   constexpr hn::Half<decltype(tag8)> half_tag8;
   auto vec1_2 = hn::UpperHalf(half_tag8, vec_demoted);
   auto vec2_2 = hn::LowerHalf(half_tag8, vec_demoted);
@@ -105,7 +114,9 @@ HWY_ATTR HWY_INLINE void StoreUnaligned4x8(D tag, uint8_t *buf,
                                            hn::VFromD<D> &vec) {
   (void)tag;
   hn::Rebind<uint8_t, D> tag8;
-  auto vec_demoted = hn::DemoteTo(tag8, vec);
+  constexpr hn::Half<D> half_tag16;
+  auto vec_demoted = hn::ReorderDemote2To(tag8, hn::LowerHalf(half_tag16, vec),
+                                          hn::UpperHalf(half_tag16, vec));
   constexpr hn::Half<decltype(tag8)> half_tag8;
   constexpr hn::Half<decltype(half_tag8)> quarter_tag8;
   auto vec1_2 = hn::UpperHalf(half_tag8, vec_demoted);
@@ -379,6 +390,371 @@ HWY_ATTR inline void ConvolveHoriz8Tap(const uint8_t *src, ptrdiff_t src_stride,
   }
 }

+HWY_ATTR inline void ConvolveVert2Tap(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_y, int w, int h) {
+  hn::CappedTag<int16_t, 16> tag16;
+  hn::Rebind<uint8_t, decltype(tag16)> pixel_tag;
+
+  auto f0 = hn::Set(tag16, filter_y[3]);
+  auto f1 = hn::Set(tag16, filter_y[4]);
+  auto round_offset = hn::Set(tag16, 1 << (FILTER_BITS - 1));
+
+  if (w == 4) {
+    for (int y = 0; y < h; y += 4) {
+      auto s0 = LoadUnaligned4x4(tag16, src + y * src_stride, src_stride);
+      auto s1 = LoadUnaligned4x4(tag16, src + (y + 1) * src_stride, src_stride);
+      auto res = hn::ShiftRight<FILTER_BITS>(s0 * f0 + s1 * f1 + round_offset);
+      StoreUnaligned4x4(tag16, dst + y * dst_stride, dst_stride, res);
+    }
+  } else if (w == 8) {
+    for (int y = 0; y < h; y += 4) {
+      auto s0 = LoadUnaligned2x8(tag16, src + y * src_stride, src_stride);
+      auto s1 = LoadUnaligned2x8(tag16, src + (y + 1) * src_stride, src_stride);
+      auto s2 = LoadUnaligned2x8(tag16, src + (y + 2) * src_stride, src_stride);
+      auto s3 = LoadUnaligned2x8(tag16, src + (y + 3) * src_stride, src_stride);
+      auto res0 = hn::ShiftRight<FILTER_BITS>(s0 * f0 + s1 * f1 + round_offset);
+      auto res1 = hn::ShiftRight<FILTER_BITS>(s2 * f0 + s3 * f1 + round_offset);
+      StoreUnaligned2x8(tag16, dst + y * dst_stride, dst_stride, res0);
+      StoreUnaligned2x8(tag16, dst + (y + 2) * dst_stride, dst_stride, res1);
+    }
+  } else if (w == 16) {
+    constexpr hn::Half<decltype(tag16)> half_tag16;
+    auto s0 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src));
+    for (int y = 0; y < h; y += 4) {
+      auto s1 = hn::PromoteTo(tag16,
+                              hn::LoadU(pixel_tag, src + (y + 1) * src_stride));
+      auto s2 = hn::PromoteTo(tag16,
+                              hn::LoadU(pixel_tag, src + (y + 2) * src_stride));
+      auto s3 = hn::PromoteTo(tag16,
+                              hn::LoadU(pixel_tag, src + (y + 3) * src_stride));
+      auto s4 = hn::PromoteTo(tag16,
+                              hn::LoadU(pixel_tag, src + (y + 4) * src_stride));
+
+      auto res0 = hn::ShiftRight<FILTER_BITS>(s0 * f0 + s1 * f1 + round_offset);
+      auto res1 = hn::ShiftRight<FILTER_BITS>(s1 * f0 + s2 * f1 + round_offset);
+      auto res2 = hn::ShiftRight<FILTER_BITS>(s2 * f0 + s3 * f1 + round_offset);
+      auto res3 = hn::ShiftRight<FILTER_BITS>(s3 * f0 + s4 * f1 + round_offset);
+
+      hn::StoreU(
+          hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res0),
+                               hn::UpperHalf(half_tag16, res0)),
+          pixel_tag, dst + y * dst_stride);
+      hn::StoreU(
+          hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res1),
+                               hn::UpperHalf(half_tag16, res1)),
+          pixel_tag, dst + (y + 1) * dst_stride);
+      hn::StoreU(
+          hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res2),
+                               hn::UpperHalf(half_tag16, res2)),
+          pixel_tag, dst + (y + 2) * dst_stride);
+      hn::StoreU(
+          hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res3),
+                               hn::UpperHalf(half_tag16, res3)),
+          pixel_tag, dst + (y + 3) * dst_stride);
+
+      s0 = s4;
+    }
+  } else {
+    hn::ScalableTag<int16_t> mul_tag;
+    hn::Rebind<uint8_t, decltype(mul_tag)> p_tag;
+    auto f0_s = hn::Set(mul_tag, filter_y[3]);
+    auto f1_s = hn::Set(mul_tag, filter_y[4]);
+    auto round_offset_s = hn::Set(mul_tag, 1 << (FILTER_BITS - 1));
+    auto vw = hn::Lanes(mul_tag);
+    for (int x = 0; x < w; x += vw) {
+      auto s0 = hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x));
+      for (int y = 0; y < h; ++y) {
+        auto s1 = hn::PromoteTo(
+            mul_tag, hn::LoadU(p_tag, src + x + (y + 1) * src_stride));
+        auto res =
+            hn::ShiftRight<FILTER_BITS>(s0 * f0_s + s1 * f1_s + round_offset_s);
+        auto res_demoted = hn::DemoteTo(p_tag, res);
+        if (x + static_cast<int>(vw) > w) {
+          hn::StoreN(res_demoted, p_tag, dst + x + y * dst_stride, w - x);
+        } else {
+          hn::StoreU(res_demoted, p_tag, dst + x + y * dst_stride);
+        }
+        s0 = s1;
+      }
+    }
+  }
+}
+
+HWY_ATTR inline void ConvolveVert4Tap(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_y, int w, int h) {
+  hn::CappedTag<int16_t, 16> tag16;
+  hn::Rebind<uint8_t, decltype(tag16)> pixel_tag;
+
+  auto f0 = hn::Set(tag16, filter_y[2] >> 1);
+  auto f1 = hn::Set(tag16, filter_y[3] >> 1);
+  auto f2 = hn::Set(tag16, filter_y[4] >> 1);
+  auto f3 = hn::Set(tag16, filter_y[5] >> 1);
+  auto round_offset = hn::Set(tag16, 1 << (FILTER_BITS - 2));
+
+  if (w == 4) {
+    for (int y = 0; y < h; y += 4) {
+      auto s0 = LoadUnaligned4x4(tag16, src + y * src_stride, src_stride);
+      auto s1 = LoadUnaligned4x4(tag16, src + (y + 1) * src_stride, src_stride);
+      auto s2 = LoadUnaligned4x4(tag16, src + (y + 2) * src_stride, src_stride);
+      auto s3 = LoadUnaligned4x4(tag16, src + (y + 3) * src_stride, src_stride);
+      auto res = hn::ShiftRight<FILTER_BITS - 1>(s0 * f0 + s1 * f1 + s2 * f2 +
+                                                 s3 * f3 + round_offset);
+      StoreUnaligned4x4(tag16, dst + y * dst_stride, dst_stride, res);
+    }
+  } else if (w == 8) {
+    auto s0 = LoadUnaligned2x8(tag16, src + 0 * src_stride, src_stride);
+    auto s1 = LoadUnaligned2x8(tag16, src + 1 * src_stride, src_stride);
+    for (int y = 0; y < h; y += 4) {
+      auto s2 = LoadUnaligned2x8(tag16, src + (y + 2) * src_stride, src_stride);
+      auto s3 = LoadUnaligned2x8(tag16, src + (y + 3) * src_stride, src_stride);
+      auto s4 = LoadUnaligned2x8(tag16, src + (y + 4) * src_stride, src_stride);
+      auto s5 = LoadUnaligned2x8(tag16, src + (y + 5) * src_stride, src_stride);
+      auto res0 = hn::ShiftRight<FILTER_BITS - 1>(s0 * f0 + s1 * f1 + s2 * f2 +
+                                                  s3 * f3 + round_offset);
+      auto res1 = hn::ShiftRight<FILTER_BITS - 1>(s2 * f0 + s3 * f1 + s4 * f2 +
+                                                  s5 * f3 + round_offset);
+      StoreUnaligned2x8(tag16, dst + y * dst_stride, dst_stride, res0);
+      StoreUnaligned2x8(tag16, dst + (y + 2) * dst_stride, dst_stride, res1);
+      s0 = s4;
+      s1 = s5;
+    }
+  } else if (w == 16) {
+    constexpr hn::Half<decltype(tag16)> half_tag16;
+    auto s0 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 0 * src_stride));
+    auto s1 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 1 * src_stride));
+    auto s2 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 2 * src_stride));
+    for (int y = 0; y < h; y += 4) {
+      auto s3 = hn::PromoteTo(tag16,
+                              hn::LoadU(pixel_tag, src + (y + 3) * src_stride));
+      auto s4 = hn::PromoteTo(tag16,
+                              hn::LoadU(pixel_tag, src + (y + 4) * src_stride));
+      auto s5 = hn::PromoteTo(tag16,
+                              hn::LoadU(pixel_tag, src + (y + 5) * src_stride));
+      auto s6 = hn::PromoteTo(tag16,
+                              hn::LoadU(pixel_tag, src + (y + 6) * src_stride));
+
+      auto res0 = hn::ShiftRight<FILTER_BITS - 1>(s0 * f0 + s1 * f1 + s2 * f2 +
+                                                  s3 * f3 + round_offset);
+      auto res1 = hn::ShiftRight<FILTER_BITS - 1>(s1 * f0 + s2 * f1 + s3 * f2 +
+                                                  s4 * f3 + round_offset);
+      auto res2 = hn::ShiftRight<FILTER_BITS - 1>(s2 * f0 + s3 * f1 + s4 * f2 +
+                                                  s5 * f3 + round_offset);
+      auto res3 = hn::ShiftRight<FILTER_BITS - 1>(s3 * f0 + s4 * f1 + s5 * f2 +
+                                                  s6 * f3 + round_offset);
+
+      hn::StoreU(
+          hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res0),
+                               hn::UpperHalf(half_tag16, res0)),
+          pixel_tag, dst + y * dst_stride);
+      hn::StoreU(
+          hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res1),
+                               hn::UpperHalf(half_tag16, res1)),
+          pixel_tag, dst + (y + 1) * dst_stride);
+      hn::StoreU(
+          hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res2),
+                               hn::UpperHalf(half_tag16, res2)),
+          pixel_tag, dst + (y + 2) * dst_stride);
+      hn::StoreU(
+          hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res3),
+                               hn::UpperHalf(half_tag16, res3)),
+          pixel_tag, dst + (y + 3) * dst_stride);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+    }
+  } else {
+    hn::ScalableTag<int16_t> mul_tag;
+    hn::Rebind<uint8_t, decltype(mul_tag)> p_tag;
+    auto f0_s = hn::Set(mul_tag, filter_y[2] >> 1);
+    auto f1_s = hn::Set(mul_tag, filter_y[3] >> 1);
+    auto f2_s = hn::Set(mul_tag, filter_y[4] >> 1);
+    auto f3_s = hn::Set(mul_tag, filter_y[5] >> 1);
+    auto round_offset_s = hn::Set(mul_tag, 1 << (FILTER_BITS - 2));
+    auto vw = hn::Lanes(mul_tag);
+    for (int x = 0; x < w; x += vw) {
+      auto s0 = hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x));
+      auto s1 =
+          hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 1 * src_stride));
+      auto s2 =
+          hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 2 * src_stride));
+      for (int y = 0; y < h; ++y) {
+        auto s3 = hn::PromoteTo(
+            mul_tag, hn::LoadU(p_tag, src + x + (y + 3) * src_stride));
+        auto res = hn::ShiftRight<FILTER_BITS - 1>(
+            s0 * f0_s + s1 * f1_s + s2 * f2_s + s3 * f3_s + round_offset_s);
+        auto res_demoted = hn::DemoteTo(p_tag, res);
+        if (x + static_cast<int>(vw) > w) {
+          hn::StoreN(res_demoted, p_tag, dst + x + y * dst_stride, w - x);
+        } else {
+          hn::StoreU(res_demoted, p_tag, dst + x + y * dst_stride);
+        }
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+      }
+    }
+  }
+}
+
+HWY_ATTR inline void ConvolveVert8Tap(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_y, int w, int h) {
+  hn::CappedTag<int16_t, 16> tag16;
+  hn::Rebind<uint8_t, decltype(tag16)> pixel_tag;
+
+  auto f0 = hn::Set(tag16, filter_y[0] >> 1);
+  auto f1 = hn::Set(tag16, filter_y[1] >> 1);
+  auto f2 = hn::Set(tag16, filter_y[2] >> 1);
+  auto f3 = hn::Set(tag16, filter_y[3] >> 1);
+  auto f4 = hn::Set(tag16, filter_y[4] >> 1);
+  auto f5 = hn::Set(tag16, filter_y[5] >> 1);
+  auto f6 = hn::Set(tag16, filter_y[6] >> 1);
+  auto f7 = hn::Set(tag16, filter_y[7] >> 1);
+  auto round_offset = hn::Set(tag16, 1 << (FILTER_BITS - 2));
+
+  if (w == 4) {
+    for (int y = 0; y < h; y += 4) {
+      auto s0 = LoadUnaligned4x4(tag16, src + (y + 0) * src_stride, src_stride);
+      auto s1 = LoadUnaligned4x4(tag16, src + (y + 1) * src_stride, src_stride);
+      auto s2 = LoadUnaligned4x4(tag16, src + (y + 2) * src_stride, src_stride);
+      auto s3 = LoadUnaligned4x4(tag16, src + (y + 3) * src_stride, src_stride);
+      auto s4 = LoadUnaligned4x4(tag16, src + (y + 4) * src_stride, src_stride);
+      auto s5 = LoadUnaligned4x4(tag16, src + (y + 5) * src_stride, src_stride);
+      auto s6 = LoadUnaligned4x4(tag16, src + (y + 6) * src_stride, src_stride);
+      auto s7 = LoadUnaligned4x4(tag16, src + (y + 7) * src_stride, src_stride);
+      auto res = hn::ShiftRight<FILTER_BITS - 1>(
+          s0 * f0 + s1 * f1 + s2 * f2 + s3 * f3 + s4 * f4 + s5 * f5 + s6 * f6 +
+          s7 * f7 + round_offset);
+      StoreUnaligned4x4(tag16, dst + y * dst_stride, dst_stride, res);
+    }
+  } else if (w == 8) {
+    auto s0 = LoadUnaligned2x8(tag16, src + 0 * src_stride, src_stride);
+    auto s1 = LoadUnaligned2x8(tag16, src + 1 * src_stride, src_stride);
+    auto s2 = LoadUnaligned2x8(tag16, src + 2 * src_stride, src_stride);
+    auto s3 = LoadUnaligned2x8(tag16, src + 3 * src_stride, src_stride);
+    auto s4 = LoadUnaligned2x8(tag16, src + 4 * src_stride, src_stride);
+    auto s5 = LoadUnaligned2x8(tag16, src + 5 * src_stride, src_stride);
+    for (int y = 0; y < h; y += 2) {
+      auto s6 = LoadUnaligned2x8(tag16, src + (y + 6) * src_stride, src_stride);
+      auto s7 = LoadUnaligned2x8(tag16, src + (y + 7) * src_stride, src_stride);
+      auto res = hn::ShiftRight<FILTER_BITS - 1>(
+          s0 * f0 + s1 * f1 + s2 * f2 + s3 * f3 + s4 * f4 + s5 * f5 + s6 * f6 +
+          s7 * f7 + round_offset);
+      StoreUnaligned2x8(tag16, dst + y * dst_stride, dst_stride, res);
+      s0 = s2;
+      s1 = s3;
+      s2 = s4;
+      s3 = s5;
+      s4 = s6;
+      s5 = s7;
+    }
+  } else if (w == 16) {
+    constexpr hn::Half<decltype(tag16)> half_tag16;
+    auto s0 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 0 * src_stride));
+    auto s1 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 1 * src_stride));
+    auto s2 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 2 * src_stride));
+    auto s3 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 3 * src_stride));
+    auto s4 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 4 * src_stride));
+    auto s5 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 5 * src_stride));
+    auto s6 = hn::PromoteTo(tag16, hn::LoadU(pixel_tag, src + 6 * src_stride));
+    for (int y = 0; y < h; ++y) {
+      auto s7 = hn::PromoteTo(tag16,
+                              hn::LoadU(pixel_tag, src + (y + 7) * src_stride));
+      auto res = hn::ShiftRight<FILTER_BITS - 1>(
+          s0 * f0 + s1 * f1 + s2 * f2 + s3 * f3 + s4 * f4 + s5 * f5 + s6 * f6 +
+          s7 * f7 + round_offset);
+      hn::StoreU(hn::ReorderDemote2To(pixel_tag, hn::LowerHalf(half_tag16, res),
+                                      hn::UpperHalf(half_tag16, res)),
+                 pixel_tag, dst + y * dst_stride);
+      s0 = s1;
+      s1 = s2;
+      s2 = s3;
+      s3 = s4;
+      s4 = s5;
+      s5 = s6;
+      s6 = s7;
+    }
+  } else {
+    hn::ScalableTag<int16_t> mul_tag;
+    hn::Rebind<uint8_t, decltype(mul_tag)> p_tag;
+    auto f0_s = hn::Set(mul_tag, filter_y[0] >> 1);
+    auto f1_s = hn::Set(mul_tag, filter_y[1] >> 1);
+    auto f2_s = hn::Set(mul_tag, filter_y[2] >> 1);
+    auto f3_s = hn::Set(mul_tag, filter_y[3] >> 1);
+    auto f4_s = hn::Set(mul_tag, filter_y[4] >> 1);
+    auto f5_s = hn::Set(mul_tag, filter_y[5] >> 1);
+    auto f6_s = hn::Set(mul_tag, filter_y[6] >> 1);
+    auto f7_s = hn::Set(mul_tag, filter_y[7] >> 1);
+    auto round_offset_s = hn::Set(mul_tag, 1 << (FILTER_BITS - 2));
+    auto vw = hn::Lanes(mul_tag);
+    for (int x = 0; x < w; x += vw) {
+      auto s0 =
+          hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 0 * src_stride));
+      auto s1 =
+          hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 1 * src_stride));
+      auto s2 =
+          hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 2 * src_stride));
+      auto s3 =
+          hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 3 * src_stride));
+      auto s4 =
+          hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 4 * src_stride));
+      auto s5 =
+          hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 5 * src_stride));
+      auto s6 =
+          hn::PromoteTo(mul_tag, hn::LoadU(p_tag, src + x + 6 * src_stride));
+      for (int y = 0; y < h; ++y) {
+        auto s7 = hn::PromoteTo(
+            mul_tag, hn::LoadU(p_tag, src + x + (y + 7) * src_stride));
+        auto sum = s0 * f0_s + s1 * f1_s + s2 * f2_s + s3 * f3_s + s4 * f4_s +
+                   s5 * f5_s + s6 * f6_s + s7 * f7_s;
+        auto res = hn::ShiftRight<FILTER_BITS - 1>(sum + round_offset_s);
+        auto res_demoted = hn::DemoteTo(p_tag, res);
+        if (x + static_cast<int>(vw) > w) {
+          hn::StoreN(res_demoted, p_tag, dst + x + y * dst_stride, w - x);
+        } else {
+          hn::StoreU(res_demoted, p_tag, dst + x + y * dst_stride);
+        }
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+      }
+    }
+  }
+}
+
+HWY_MAYBE_UNUSED void Convolve8Vert(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const int16_t *filter_x, int x_step_q4,
+                                    const int16_t *filter_y, int y_step_q4,
+                                    int w, int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)x_step_q4;
+  (void)filter_x;
+  (void)y_step_q4;
+
+  src -= src_stride * ((SUBPEL_TAPS / 2) - 1);
+  int filter_taps = get_filter_taps_convolve8(filter_y);
+  if (filter_taps == 2) {
+    ConvolveVert2Tap(src + src_stride * 3, src_stride, dst, dst_stride,
+                     filter_y, w, h);
+  } else if (filter_taps == 4) {
+    ConvolveVert4Tap(src + src_stride * 2, src_stride, dst, dst_stride,
+                     filter_y, w, h);
+  } else {
+    // Otherwise filter_taps == 8.
+    ConvolveVert8Tap(src, src_stride, dst, dst_stride, filter_y, w, h);
+  }
+}
+
 HWY_MAYBE_UNUSED void Convolve8Horiz(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index e8e94a42c9..c46d146138 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -49,12 +49,6 @@ static inline void xx_storeu2_epi32(const uint8_t *output_ptr,
       _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
 }

-static inline __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
-  __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
-  a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
-  return a;
-}
-
 static inline void xx_storeu2_epi64(const uint8_t *output_ptr,
                                     const ptrdiff_t stride, const __m256i *a) {
   _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
@@ -790,6 +784,14 @@ static void aom_filter_block1d16_h8_avx2(
   }
 }

+#if !CONFIG_HIGHWAY
+
+static inline __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
+  __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
+  a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
+  return a;
+}
+
 static void aom_filter_block1d8_v4_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
@@ -1400,33 +1402,39 @@ static void aom_filter_block1d4_v4_avx2(
     srcReg4x = srcReg6x;
   }
 }
+#endif  // !CONFIG_HIGHWAY

 #if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v2_ssse3;
 filter8_1dfunction aom_filter_block1d16_h2_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_ssse3;
 filter8_1dfunction aom_filter_block1d8_h2_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_ssse3;
 filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
-#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
 #define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
-#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
 #define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
-#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
 #define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
+
 // void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
 //                                const int16_t *filter_x, int x_step_q4,
 //                                const int16_t *filter_y, int y_step_q4,
 //                                int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+
+#if !CONFIG_HIGHWAY
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
+#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
+#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
+#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
+
 // void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                               uint8_t *dst, ptrdiff_t dst_stride,
 //                               const int16_t *filter_x, int x_step_q4,
 //                               const int16_t *filter_y, int y_step_q4,
 //                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
+#endif  // !CONFIG_HIGHWAY

 #endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/aom_dsp/x86/convolve_vert_hwy_avx2.cc b/aom_dsp/x86/convolve_vert_hwy_avx2.cc
new file mode 100644
index 0000000000..e9288331d4
--- /dev/null
+++ b/aom_dsp/x86/convolve_vert_hwy_avx2.cc
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define HWY_BASELINE_TARGETS HWY_AVX2
+#define HWY_BROKEN_32BIT 0
+
+#include "aom_dsp/convolve_hwy.h"
+
+extern "C" void aom_convolve8_vert_avx2(const uint8_t *src,
+                                        ptrdiff_t src_stride, uint8_t *dst,
+                                        ptrdiff_t dst_stride,
+                                        const int16_t *filter_x, int x_step_q4,
+                                        const int16_t *filter_y, int y_step_q4,
+                                        int w, int h);
+
+HWY_ATTR void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h) {
+  HWY_NAMESPACE::Convolve8Vert(src, src_stride, dst, dst_stride, filter_x,
+                               x_step_q4, filter_y, y_step_q4, w, h);
+}
diff --git a/aom_dsp/x86/convolve_vert_hwy_avx512.cc b/aom_dsp/x86/convolve_vert_hwy_avx512.cc
new file mode 100644
index 0000000000..0c6105eb8a
--- /dev/null
+++ b/aom_dsp/x86/convolve_vert_hwy_avx512.cc
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define HWY_BASELINE_TARGETS HWY_AVX3_DL
+#define HWY_BROKEN_32BIT 0
+
+#include "aom_dsp/convolve_hwy.h"
+
+extern "C" void aom_convolve8_vert_avx512(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h);
+
+HWY_ATTR void aom_convolve8_vert_avx512(const uint8_t *src,
+                                        ptrdiff_t src_stride, uint8_t *dst,
+                                        ptrdiff_t dst_stride,
+                                        const int16_t *filter_x, int x_step_q4,
+                                        const int16_t *filter_y, int y_step_q4,
+                                        int w, int h) {
+  HWY_NAMESPACE::Convolve8Vert(src, src_stride, dst, dst_stride, filter_x,
+                               x_step_q4, filter_y, y_step_q4, w, h);
+}
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 23db6d70ca..2d1e12cd09 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -884,8 +884,13 @@ extern "C" void aom_convolve8_horiz_avx512(
     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
     const int16_t *filter_y, int y_step_q4, int w, int h);

+extern "C" void aom_convolve8_vert_avx512(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h);
+
 const ConvolveFunctions convolve8_avx512(aom_convolve8_horiz_avx512,
-                                         aom_convolve8_vert_c, 0);
+                                         aom_convolve8_vert_avx512, 0);
 const ConvolveParam kArray_Convolve8_avx512[] = { ALL_SIZES(convolve8_avx512) };

 INSTANTIATE_TEST_SUITE_P(AVX512, LowbdConvolveTest,