Commit 6cf8bf673d for aom

commit 6cf8bf673d97ce8c006a8c3887336f6087a37eb8
Author: Jerome Jiang <jianj@google.com>
Date:   Fri May 8 17:29:22 2026 -0400

    libaom: Optimize convolve for AVX512

    Optimize aom_convolve8_horiz for AVX512.
    Falls back to AVX2 for widths <= 16 for better performance.

    Large blocks, time (AVX512 vs AVX2):
    - 32x64: 81.77 µs vs 172.3 µs
    - 64x64: 154.4 µs vs 310.1 µs
    - 128x64: 301.5 µs vs 528.7 µs
    - 64x128: 312.1 µs vs 678.6 µs
    - 128x128: 600.3 µs vs 1.356 ms

    Change-Id: Icf5d870ccdcc87ab92b407c5ff80d0ca6b3caa93

diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 0738274b1b..b80ee54a97 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -46,6 +46,10 @@ list(APPEND AOM_DSP_COMMON_SOURCES
             "${AOM_ROOT}/aom_dsp/txfm_common.h"
             "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h")

+if(CONFIG_HIGHWAY)
+  list(APPEND AOM_DSP_COMMON_SOURCES "${AOM_ROOT}/aom_dsp/convolve_hwy.h")
+endif()
+
 list(APPEND AOM_DSP_COMMON_ASM_SSE2
             "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm")
 if(CONFIG_AV1_HIGHBITDEPTH)
@@ -97,6 +101,11 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
             "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c"
             "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h")

+if(CONFIG_HIGHWAY)
+  list(APPEND AOM_DSP_COMMON_INTRIN_AVX512
+              "${AOM_ROOT}/aom_dsp/x86/convolve_hwy_avx512.cc")
+endif()
+
 list(APPEND AOM_DSP_COMMON_INTRIN_NEON
             "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c"
             "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
@@ -493,6 +502,12 @@ function(setup_aom_dsp_targets)
                                   "AOM_DSP_ENCODER_INTRIN_AVX512")
   endif()

+  if(HAVE_AVX512 AND CONFIG_HIGHWAY)
+    add_intrinsics_object_library("-march=skylake-avx512" "avx512"
+                                  "aom_dsp_common"
+                                  "AOM_DSP_COMMON_INTRIN_AVX512")
+  endif()
+
   if(HAVE_NEON)
     add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
                                   "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON")
diff --git a/aom_dsp/convolve_hwy.h b/aom_dsp/convolve_hwy.h
new file mode 100644
index 0000000000..58e18b0f72
--- /dev/null
+++ b/aom_dsp/convolve_hwy.h
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_CONVOLVE_HWY_H_
+#define AOM_AOM_DSP_CONVOLVE_HWY_H_
+
+#include <cassert>
+
+#include "aom_dsp/arm/aom_filter.h"
+#include "third_party/highway/hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+
+namespace {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+template <typename D>
+HWY_ATTR HWY_INLINE hn::VFromD<D> LoadUnaligned4x4(D tag16, const uint8_t *buf,
+                                                   ptrdiff_t stride) {
+  HWY_ALIGN int16_t buf_to_array[16];
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      buf_to_array[i * 4 + j] = buf[j];
+    }
+    buf += stride;
+  }
+  return hn::Load(tag16, buf_to_array);
+}
+
+template <typename D>
+HWY_ATTR HWY_INLINE void StoreUnaligned4x4(D tag16, uint8_t *buf,
+                                           ptrdiff_t stride,
+                                           hn::VFromD<D> &vec) {
+  (void)tag16;
+  hn::Rebind<uint8_t, D> tag8;
+  auto vec_demoted = hn::DemoteTo(tag8, vec);
+  constexpr hn::Half<decltype(tag8)> half_tag;
+  constexpr hn::Half<decltype(half_tag)> quarter_tag;
+  auto vec1_2 = hn::LowerHalf(half_tag, vec_demoted);
+  auto vec2_2 = hn::UpperHalf(half_tag, vec_demoted);
+  auto vec1_4 = hn::LowerHalf(quarter_tag, vec1_2);
+  auto vec2_4 = hn::UpperHalf(quarter_tag, vec1_2);
+  auto vec3_4 = hn::LowerHalf(quarter_tag, vec2_2);
+  auto vec4_4 = hn::UpperHalf(quarter_tag, vec2_2);
+  hn::StoreU(vec1_4, quarter_tag, buf);
+  hn::StoreU(vec2_4, quarter_tag, buf + stride);
+  hn::StoreU(vec3_4, quarter_tag, buf + 2 * stride);
+  hn::StoreU(vec4_4, quarter_tag, buf + 3 * stride);
+}
+
+template <typename D>
+HWY_ATTR HWY_INLINE hn::VFromD<D> LoadUnaligned2x8(D tag16, const uint8_t *buf,
+                                                   ptrdiff_t stride) {
+  hn::Rebind<uint8_t, D> tag8;
+  constexpr hn::Half<decltype(tag8)> half_tag8;
+  auto first_half = hn::LoadU(half_tag8, buf);
+  auto second_half = hn::LoadU(half_tag8, buf + stride);
+  return hn::PromoteTo(tag16, hn::Combine(tag8, first_half, second_half));
+}
+
+template <typename D>
+HWY_ATTR HWY_INLINE void StoreUnaligned2x8(D tag, uint8_t *buf,
+                                           ptrdiff_t stride,
+                                           hn::VFromD<D> &vec) {
+  (void)tag;
+  hn::Rebind<uint8_t, D> tag8;
+  auto vec_demoted = hn::DemoteTo(tag8, vec);
+  constexpr hn::Half<decltype(tag8)> half_tag8;
+  auto vec1_2 = hn::UpperHalf(half_tag8, vec_demoted);
+  auto vec2_2 = hn::LowerHalf(half_tag8, vec_demoted);
+  hn::StoreU(vec1_2, half_tag8, buf);
+  hn::StoreU(vec2_2, half_tag8, buf + stride);
+}
+
+template <typename D>
+HWY_ATTR HWY_INLINE hn::VFromD<D> LoadUnaligned4x8(D scalable_tag,
+                                                   const uint8_t *buf,
+                                                   ptrdiff_t stride) {
+  hn::Rebind<uint8_t, D> tag8;
+  constexpr hn::Half<decltype(tag8)> half_tag8;
+  constexpr hn::Half<decltype(half_tag8)> quarter_tag8;
+  auto first_quarter = hn::LoadU(quarter_tag8, buf);
+  auto second_quarter = hn::LoadU(quarter_tag8, buf + stride);
+  auto third_quarter = hn::LoadU(quarter_tag8, buf + 2 * stride);
+  auto fourth_quarter = hn::LoadU(quarter_tag8, buf + 3 * stride);
+  return hn::PromoteTo(
+      scalable_tag,
+      hn::Combine(tag8, hn::Combine(half_tag8, first_quarter, second_quarter),
+                  hn::Combine(half_tag8, third_quarter, fourth_quarter)));
+}
+
+template <typename D>
+HWY_ATTR HWY_INLINE void StoreUnaligned4x8(D tag, uint8_t *buf,
+                                           ptrdiff_t stride,
+                                           hn::VFromD<D> &vec) {
+  (void)tag;
+  hn::Rebind<uint8_t, D> tag8;
+  auto vec_demoted = hn::DemoteTo(tag8, vec);
+  constexpr hn::Half<decltype(tag8)> half_tag8;
+  constexpr hn::Half<decltype(half_tag8)> quarter_tag8;
+  auto vec1_2 = hn::UpperHalf(half_tag8, vec_demoted);
+  auto vec2_2 = hn::LowerHalf(half_tag8, vec_demoted);
+  auto vec1_4 = hn::UpperHalf(quarter_tag8, vec1_2);
+  auto vec2_4 = hn::LowerHalf(quarter_tag8, vec1_2);
+  auto vec3_4 = hn::UpperHalf(quarter_tag8, vec2_2);
+  auto vec4_4 = hn::LowerHalf(quarter_tag8, vec2_2);
+  hn::StoreU(vec1_4, quarter_tag8, buf);
+  hn::StoreU(vec2_4, quarter_tag8, buf + stride);
+  hn::StoreU(vec3_4, quarter_tag8, buf + 2 * stride);
+  hn::StoreU(vec4_4, quarter_tag8, buf + 3 * stride);
+}
+
+HWY_ATTR inline void ConvolveHoriz2Tap(const uint8_t *src, ptrdiff_t src_stride,
+                                       uint8_t *dst, ptrdiff_t dst_stride,
+                                       const int16_t *filter_x, int w, int h) {
+  hn::ScalableTag<int16_t> mul_tag;
+  hn::Rebind<uint8_t, decltype(mul_tag)> pixel_tag;
+  auto filter_0 = hn::Set(mul_tag, filter_x[3]);
+  auto filter_1 = hn::Set(mul_tag, filter_x[4]);
+  auto vw = hn::Lanes(mul_tag);
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; j += vw) {
+      auto src0 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j]));
+      auto src1 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 1]));
+      auto mulv = hn::RoundingShiftRight<FILTER_BITS>(src0 * filter_0 +
+                                                      src1 * filter_1);
+      auto mulv_demoted = hn::DemoteTo(pixel_tag, mulv);
+      if (j + static_cast<int>(vw) > w) {
+        hn::StoreN(mulv_demoted, pixel_tag, &dst[j], w - j);
+      } else {
+        hn::StoreU(mulv_demoted, pixel_tag, &dst[j]);
+      }
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+template <typename D, typename DFilter>
+HWY_ATTR HWY_INLINE hn::VFromD<D> Convolve4_8(
+    D tag16, DFilter tag_filter, hn::VFromD<D> &s0, hn::VFromD<D> &s1,
+    hn::VFromD<D> &s2, hn::VFromD<D> &s3, hn::VFromD<DFilter> &filter) {
+  (void)tag_filter;
+  auto mul0 = hn::Mul(s0, hn::Set(tag16, hn::ExtractLane(filter, 0)));
+  auto mul1 = hn::Mul(s1, hn::Set(tag16, hn::ExtractLane(filter, 1)));
+  auto mul2 = hn::Mul(s2, hn::Set(tag16, hn::ExtractLane(filter, 2)));
+  auto mul3 = hn::Mul(s3, hn::Set(tag16, hn::ExtractLane(filter, 3)));
+
+  auto res = mul0 + mul1 + mul2 + mul3;
+  // Shift (FILTER_BITS - 1) because filter values were halved.
+  return hn::RoundingShiftRight<FILTER_BITS - 1>(res);
+}
+
+HWY_ATTR inline void ConvolveHoriz4Tap(const uint8_t *src, ptrdiff_t src_stride,
+                                       uint8_t *dst, ptrdiff_t dst_stride,
+                                       const int16_t *filter_x, int w, int h) {
+  hn::CappedTag<int16_t, 16> tag16;
+  hn::CappedTag<int16_t, 4> filter_tag;
+  auto f_vec = hn::LoadU(filter_tag, filter_x + 2);
+  // All filter values are even, so halve them to reduce the intermediate
+  // precision requirements.
+  f_vec = hn::ShiftRight<1>(f_vec);
+
+  if (w == 4) {
+    // Each iteration processes a 4x4 block
+    do {
+      auto src0 = LoadUnaligned4x4(tag16, src, src_stride);
+      auto src1 = LoadUnaligned4x4(tag16, src + 1, src_stride);
+      auto src2 = LoadUnaligned4x4(tag16, src + 2, src_stride);
+      auto src3 = LoadUnaligned4x4(tag16, src + 3, src_stride);
+      auto result =
+          Convolve4_8(tag16, filter_tag, src0, src1, src2, src3, f_vec);
+      StoreUnaligned4x4(tag16, dst, dst_stride, result);
+      h -= 4;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+    } while (h > 0);
+  } else if (w == 8) {
+    // Each iteration processes a 2x8 block
+    do {
+      auto src0 = LoadUnaligned2x8(tag16, src, src_stride);
+      auto src1 = LoadUnaligned2x8(tag16, src + 1, src_stride);
+      auto src2 = LoadUnaligned2x8(tag16, src + 2, src_stride);
+      auto src3 = LoadUnaligned2x8(tag16, src + 3, src_stride);
+      auto result =
+          Convolve4_8(tag16, filter_tag, src0, src1, src2, src3, f_vec);
+      StoreUnaligned2x8(tag16, dst, dst_stride, result);
+      h -= 2;
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+    } while (h > 0);
+  } else if (w == 16) {
+    // Each iteration processes a 1x16 block
+    do {
+      hn::Rebind<uint8_t, decltype(tag16)> tag8;
+      auto src0 = hn::PromoteTo(tag16, hn::LoadU(tag8, src));
+      auto src1 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 1));
+      auto src2 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 2));
+      auto src3 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 3));
+      auto result =
+          Convolve4_8(tag16, filter_tag, src0, src1, src2, src3, f_vec);
+      hn::StoreU(hn::DemoteTo(tag8, result), tag8, dst);
+      h--;
+      src += src_stride;
+      dst += dst_stride;
+    } while (h > 0);
+  } else {
+    hn::ScalableTag<int16_t> mul_tag;
+    hn::Rebind<uint8_t, decltype(mul_tag)> pixel_tag;
+    auto vw = hn::Lanes(mul_tag);
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += vw) {
+        auto src0 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j]));
+        auto src1 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 1]));
+        auto src2 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 2]));
+        auto src3 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 3]));
+        auto result =
+            Convolve4_8(mul_tag, filter_tag, src0, src1, src2, src3, f_vec);
+        auto result_demoted = hn::DemoteTo(pixel_tag, result);
+        if (j + static_cast<int>(vw) > w) {
+          hn::StoreN(result_demoted, pixel_tag, &dst[j], w - j);
+        } else {
+          hn::StoreU(result_demoted, pixel_tag, &dst[j]);
+        }
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
+
+template <typename D, typename DFilter>
+HWY_ATTR HWY_INLINE hn::VFromD<D> Convolve8_8(
+    D tag16, DFilter tag_filter, hn::VFromD<D> &s0, hn::VFromD<D> &s1,
+    hn::VFromD<D> &s2, hn::VFromD<D> &s3, hn::VFromD<D> &s4, hn::VFromD<D> &s5,
+    hn::VFromD<D> &s6, hn::VFromD<D> &s7, hn::VFromD<DFilter> &filter) {
+  (void)tag_filter;
+  auto filter_0 = hn::ExtractLane(filter, 0);
+  auto filter_1 = hn::ExtractLane(filter, 1);
+  auto filter_2 = hn::ExtractLane(filter, 2);
+  auto filter_3 = hn::ExtractLane(filter, 3);
+  auto filter_4 = hn::ExtractLane(filter, 4);
+  auto filter_5 = hn::ExtractLane(filter, 5);
+  auto filter_6 = hn::ExtractLane(filter, 6);
+  auto filter_7 = hn::ExtractLane(filter, 7);
+  auto mul0 = hn::Mul(s0, hn::Set(tag16, filter_0));
+  auto mul1 = hn::Mul(s1, hn::Set(tag16, filter_1));
+  auto mul2 = hn::Mul(s2, hn::Set(tag16, filter_2));
+  auto mul3 = hn::Mul(s3, hn::Set(tag16, filter_3));
+  auto mul4 = hn::Mul(s4, hn::Set(tag16, filter_4));
+  auto mul5 = hn::Mul(s5, hn::Set(tag16, filter_5));
+  auto mul6 = hn::Mul(s6, hn::Set(tag16, filter_6));
+  auto mul7 = hn::Mul(s7, hn::Set(tag16, filter_7));
+
+  auto res = mul0 + mul1 + mul2 + mul3 + mul4 + mul5 + mul6 + mul7;
+  // Shift (FILTER_BITS - 1) because filter values were halved.
+  return hn::RoundingShiftRight<FILTER_BITS - 1>(res);
+}
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_global[]) = {
+  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
+  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
+  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
+  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
+  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
+  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
+  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+HWY_ATTR inline void ConvolveHoriz8Tap(const uint8_t *src, ptrdiff_t src_stride,
+                                       uint8_t *dst, ptrdiff_t dst_stride,
+                                       const int16_t *filter_x, int w, int h) {
+  hn::CappedTag<int16_t, 16> tag16;
+  hn::CappedTag<int16_t, 8> filter_tag;
+  auto f_vec = hn::LoadU(filter_tag, filter_x);
+  // All filter values are even, so halve them to reduce the intermediate
+  // precision requirements.
+  f_vec = hn::ShiftRight<1>(f_vec);
+
+  if (w == 4) {
+    do {
+      auto src0 = LoadUnaligned4x4(tag16, src, src_stride);
+      auto src1 = LoadUnaligned4x4(tag16, src + 1, src_stride);
+      auto src2 = LoadUnaligned4x4(tag16, src + 2, src_stride);
+      auto src3 = LoadUnaligned4x4(tag16, src + 3, src_stride);
+      auto src4 = LoadUnaligned4x4(tag16, src + 4, src_stride);
+      auto src5 = LoadUnaligned4x4(tag16, src + 5, src_stride);
+      auto src6 = LoadUnaligned4x4(tag16, src + 6, src_stride);
+      auto src7 = LoadUnaligned4x4(tag16, src + 7, src_stride);
+      auto result = Convolve8_8(tag16, filter_tag, src0, src1, src2, src3, src4,
+                                src5, src6, src7, f_vec);
+      StoreUnaligned4x4(tag16, dst, dst_stride, result);
+      h -= 4;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+    } while (h > 0);
+  } else if (w == 8) {
+    // Each iteration processes a 2x8 block
+    do {
+      auto src0 = LoadUnaligned2x8(tag16, src, src_stride);
+      auto src1 = LoadUnaligned2x8(tag16, src + 1, src_stride);
+      auto src2 = LoadUnaligned2x8(tag16, src + 2, src_stride);
+      auto src3 = LoadUnaligned2x8(tag16, src + 3, src_stride);
+      auto src4 = LoadUnaligned2x8(tag16, src + 4, src_stride);
+      auto src5 = LoadUnaligned2x8(tag16, src + 5, src_stride);
+      auto src6 = LoadUnaligned2x8(tag16, src + 6, src_stride);
+      auto src7 = LoadUnaligned2x8(tag16, src + 7, src_stride);
+      auto result = Convolve8_8(tag16, filter_tag, src0, src1, src2, src3, src4,
+                                src5, src6, src7, f_vec);
+      StoreUnaligned2x8(tag16, dst, dst_stride, result);
+      h -= 2;
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+    } while (h > 0);
+  } else if (w == 16) {
+    // Each iteration processes a 1x16 block
+    do {
+      hn::Rebind<uint8_t, decltype(tag16)> tag8;
+      auto src0 = hn::PromoteTo(tag16, hn::LoadU(tag8, src));
+      auto src1 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 1));
+      auto src2 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 2));
+      auto src3 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 3));
+      auto src4 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 4));
+      auto src5 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 5));
+      auto src6 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 6));
+      auto src7 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 7));
+      auto result = Convolve8_8(tag16, filter_tag, src0, src1, src2, src3, src4,
+                                src5, src6, src7, f_vec);
+      hn::StoreU(hn::DemoteTo(tag8, result), tag8, dst);
+      h--;
+      src += src_stride;
+      dst += dst_stride;
+    } while (h > 0);
+  } else {
+    // This tag will have 32 lanes (for avx512) or 16 lanes (for avx2)
+    hn::ScalableTag<int16_t> mul_tag;
+    hn::Rebind<uint8_t, decltype(mul_tag)> pixel_tag;
+    auto vw = hn::Lanes(mul_tag);
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += vw) {
+        auto s0 = hn::LoadU(pixel_tag, &src[j]);
+        auto s1 = hn::LoadU(pixel_tag, &src[j + 1]);
+        auto s2 = hn::LoadU(pixel_tag, &src[j + 2]);
+        auto s3 = hn::LoadU(pixel_tag, &src[j + 3]);
+        auto s4 = hn::LoadU(pixel_tag, &src[j + 4]);
+        auto s5 = hn::LoadU(pixel_tag, &src[j + 5]);
+        auto s6 = hn::LoadU(pixel_tag, &src[j + 6]);
+        auto s7 = hn::LoadU(pixel_tag, &src[j + 7]);
+        auto src0 = hn::PromoteTo(mul_tag, s0);
+        auto src1 = hn::PromoteTo(mul_tag, s1);
+        auto src2 = hn::PromoteTo(mul_tag, s2);
+        auto src3 = hn::PromoteTo(mul_tag, s3);
+        auto src4 = hn::PromoteTo(mul_tag, s4);
+        auto src5 = hn::PromoteTo(mul_tag, s5);
+        auto src6 = hn::PromoteTo(mul_tag, s6);
+        auto src7 = hn::PromoteTo(mul_tag, s7);
+        auto result = Convolve8_8(mul_tag, filter_tag, src0, src1, src2, src3,
+                                  src4, src5, src6, src7, f_vec);
+        auto result_demoted = hn::DemoteTo(pixel_tag, result);
+        if (j + static_cast<int>(vw) > w) {
+          hn::StoreN(result_demoted, pixel_tag, &dst[j], w - j);
+        } else {
+          hn::StoreU(result_demoted, pixel_tag, &dst[j]);
+        }
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
+
+HWY_MAYBE_UNUSED void Convolve8Horiz(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)x_step_q4;
+  (void)filter_y;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1);
+  int filter_taps = get_filter_taps_convolve8(filter_x);
+  if (filter_taps == 2) {
+    ConvolveHoriz2Tap(src + 3, src_stride, dst, dst_stride, filter_x, w, h);
+  } else if (filter_taps == 4) {
+    ConvolveHoriz4Tap(src + 2, src_stride, dst, dst_stride, filter_x, w, h);
+  } else {
+    // All remaining tap counts (nominally 8) use the full 8-tap kernel.
+    ConvolveHoriz8Tap(src, src_stride, dst, dst_stride, filter_x, w, h);
+  }
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace
+
+#define CONVOLVE8HORIZ(suffix)                                                \
+  extern "C" void aom_convolve8_horiz_##suffix(                               \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
+      const int16_t *filter_y, int y_step_q4, int w, int h);                  \
+  HWY_ATTR void aom_convolve8_horiz_##suffix(                                 \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
+      const int16_t *filter_y, int y_step_q4, int w, int h) {                 \
+    HWY_NAMESPACE::Convolve8Horiz(src, src_stride, dst, dst_stride, filter_x, \
+                                  x_step_q4, filter_y, y_step_q4, w, h);      \
+  }
+
+HWY_AFTER_NAMESPACE();
+
+#endif  // AOM_AOM_DSP_CONVOLVE_HWY_H_
diff --git a/aom_dsp/x86/convolve_hwy_avx512.cc b/aom_dsp/x86/convolve_hwy_avx512.cc
new file mode 100644
index 0000000000..c1aa90492f
--- /dev/null
+++ b/aom_dsp/x86/convolve_hwy_avx512.cc
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define HWY_BASELINE_TARGETS HWY_AVX3_DL
+#define HWY_BROKEN_32BIT 0
+
+#include "aom_dsp/convolve_hwy.h"
+
+extern "C" void aom_convolve8_horiz_avx2(const uint8_t *src,
+                                         ptrdiff_t src_stride, uint8_t *dst,
+                                         ptrdiff_t dst_stride,
+                                         const int16_t *filter_x, int x_step_q4,
+                                         const int16_t *filter_y, int y_step_q4,
+                                         int w, int h);
+
+extern "C" void aom_convolve8_horiz_avx512(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h);
+
+HWY_ATTR void aom_convolve8_horiz_avx512(const uint8_t *src,
+                                         ptrdiff_t src_stride, uint8_t *dst,
+                                         ptrdiff_t dst_stride,
+                                         const int16_t *filter_x, int x_step_q4,
+                                         const int16_t *filter_y, int y_step_q4,
+                                         int w, int h) {
+  // Fallback to AVX2 for small block sizes (w <= 16) where the handwritten
+  // AVX2 implementation was measured to be faster than the Highway AVX512
+  // implementation in benchmarks.
+  if (w <= 16) {
+    aom_convolve8_horiz_avx2(src, src_stride, dst, dst_stride, filter_x,
+                             x_step_q4, filter_y, y_step_q4, w, h);
+  } else {
+    HWY_NAMESPACE::Convolve8Horiz(src, src_stride, dst, dst_stride, filter_x,
+                                  x_step_q4, filter_y, y_step_q4, w, h);
+  }
+}
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 578d9a2614..23db6d70ca 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -875,8 +875,23 @@ const ConvolveParam kArray_Convolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };

 INSTANTIATE_TEST_SUITE_P(AVX2, LowbdConvolveTest,
                          ::testing::ValuesIn(kArray_Convolve8_avx2));
+
 #endif  // HAVE_AVX2

+#if HAVE_AVX512 && CONFIG_HIGHWAY
+extern "C" void aom_convolve8_horiz_avx512(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h);
+
+const ConvolveFunctions convolve8_avx512(aom_convolve8_horiz_avx512,
+                                         aom_convolve8_vert_c, 0);
+const ConvolveParam kArray_Convolve8_avx512[] = { ALL_SIZES(convolve8_avx512) };
+
+INSTANTIATE_TEST_SUITE_P(AVX512, LowbdConvolveTest,
+                         ::testing::ValuesIn(kArray_Convolve8_avx512));
+#endif  // HAVE_AVX512 && CONFIG_HIGHWAY
+
 #if HAVE_NEON
 #if CONFIG_AV1_HIGHBITDEPTH
 const ConvolveFunctions wrap_convolve8_neon(wrap_convolve8_horiz_neon_8,