Commit b9714769f9 for aom
commit b9714769f98164731a9c85e30a76e119829e96cc
Author: Jerome Jiang <jianj@google.com>
Date: Tue Jun 2 17:42:03 2026 -0400
Replace native AVX2 convolve horiz with Highway
Map the Highway convolve implementation directly under the
native `_avx2` suffix when CONFIG_HIGHWAY is enabled, replacing
the hand-written legacy AVX2 assembly convolve.
Fall back to native SSSE3 assembly for the 16x32 block size, the one size where the Highway paths are slower in the table below.
Size | Leg AVX2 | Hwy AVX2 | Hwy AVX512 | AVX2 % | AVX512 %
--------+----------+----------+-----------+--------+--------
4x4 | 5.25µs | 4.63µs | 4.43µs | +11.8% | +15.6%
8x4 | 5.32µs | 4.62µs | 4.42µs | +13.1% | +16.9%
4x8 | 7.17µs | 5.77µs | 5.58µs | +19.5% | +22.1%
8x8 | 6.62µs | 5.77µs | 5.57µs | +12.8% | +15.8%
16x8 | 7.49µs | 7.07µs | 6.71µs | +5.6% | +10.4%
8x16 | 9.64µs | 8.23µs | 8.05µs | +14.6% | +16.4%
16x16 | 11.52µs | 11.03µs | 10.86µs | +4.2% | +5.7%
32x16 | 20.93µs | 19.79µs | 19.84µs | +5.4% | +5.2%
16x32 | 19.44µs | 20.54µs | 20.73µs | -5.6% | -6.6%
32x32 | 36.92µs | 33.54µs | 33.41µs | +9.1% | +9.5%
64x32 | 73.25µs | 59.77µs | 59.09µs | +18.4% | +19.3%
32x64 | 171.2µs | 152.6µs | 153.6µs | +10.8% | +10.2%
64x64 | 307.8µs | 254.9µs | 152.3µs | +17.1% | +50.5%
128x64 | 527.4µs | 466.7µs | 299.9µs | +11.5% | +43.1%
64x128 | 679.3µs | 505.2µs | 306.1µs | +25.6% | +54.9%
128x128 | 1.355ms | 937.8µs | 596.6µs | +30.8% | +55.9%
Change-Id: I5956f2eee277310e7aeb4b28ab4c2e496686daa6
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index a09a4f23f8..42914e8b2a 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -103,6 +103,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
if(CONFIG_HIGHWAY)
list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/convolve_hwy_avx2.cc"
"${AOM_ROOT}/aom_dsp/x86/convolve_vert_hwy_avx2.cc")
list(APPEND AOM_DSP_COMMON_INTRIN_AVX512
"${AOM_ROOT}/aom_dsp/x86/convolve_hwy_avx512.cc"
diff --git a/aom_dsp/convolve_hwy.h b/aom_dsp/convolve_hwy.h
index 0b28531038..a93c65b186 100644
--- a/aom_dsp/convolve_hwy.h
+++ b/aom_dsp/convolve_hwy.h
@@ -12,11 +12,20 @@
#ifndef AOM_AOM_DSP_CONVOLVE_HWY_H_
#define AOM_AOM_DSP_CONVOLVE_HWY_H_
+#include "config/aom_config.h"
+
#include <cassert>
#include "aom_dsp/arm/aom_filter.h"
#include "third_party/highway/hwy/highway.h"
+#if HAVE_SSSE3
+extern "C" void aom_convolve8_horiz_ssse3(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h);
+#endif
+
HWY_BEFORE_NAMESPACE();
namespace {
@@ -24,6 +33,107 @@ namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;
+template <typename T16_8, typename T8_16, typename V8_16, typename M,  // 8-tap sum: four shuffled pairwise multiply-adds, summed.
+ typename VC>
+HWY_ATTR HWY_INLINE hn::VFromD<T16_8> ComputeHoriz8TapSum(  // Result vector type is determined by tag16_8.
+ T16_8 tag16_8, T8_16 tag8_16, const V8_16 &d0, const V8_16 &d2,
+ const V8_16 &d4, const V8_16 &d6, const M &shuffle_mask, const VC &coeff01,  // The same shuffle_mask is applied to d0, d2, d4 and d6.
+ const VC &coeff23, const VC &coeff45, const VC &coeff67) {
+ (void)tag8_16;  // tag8_16 is not otherwise used in this function.
+ return hn::Add(  // Adds the (d0, d2) partial sum to the (d4, d6) partial sum.
+ hn::Add(hn::SatWidenMulPairwiseAdd(  // d0 paired with coeff01
+ tag16_8, hn::TableLookupBytes(d0, shuffle_mask), coeff01),
+ hn::SatWidenMulPairwiseAdd(  // d2 paired with coeff23
+ tag16_8, hn::TableLookupBytes(d2, shuffle_mask), coeff23)),
+ hn::Add(hn::SatWidenMulPairwiseAdd(  // d4 paired with coeff45
+ tag16_8, hn::TableLookupBytes(d4, shuffle_mask), coeff45),
+ hn::SatWidenMulPairwiseAdd(  // d6 paired with coeff67
+ tag16_8, hn::TableLookupBytes(d6, shuffle_mask), coeff67)));
+}
+
+template <typename T16_8, typename T8_16, typename V8_16, typename M,  // 4-tap sum: two shuffled pairwise multiply-adds, summed.
+ typename VC>
+HWY_ATTR HWY_INLINE hn::VFromD<T16_8> ComputeHoriz4TapSum(  // Result vector type is determined by tag16_8.
+ T16_8 tag16_8, T8_16 tag8_16, const V8_16 &d0, const V8_16 &d2,
+ const M &shuffle_mask, const VC &coeff23, const VC &coeff45) {  // The same shuffle_mask is applied to d0 and d2.
+ (void)tag8_16;  // tag8_16 is not otherwise used in this function.
+ return hn::Add(hn::SatWidenMulPairwiseAdd(  // d0 paired with coeff23
+ tag16_8, hn::TableLookupBytes(d0, shuffle_mask), coeff23),
+ hn::SatWidenMulPairwiseAdd(  // d2 paired with coeff45
+ tag16_8, hn::TableLookupBytes(d2, shuffle_mask), coeff45));
+}
+
+template <int chunk_size, typename T8_8, typename T8_4, typename V16_8>  // Adds bias, shifts, demotes and stores one chunk to dst.
+HWY_ATTR HWY_INLINE void StoreOutputChunk(T8_8 tag8_8, T8_4 tag8_4,
+ const V16_8 &sum,
+ const V16_8 &bias_val, uint8_t *dst) {
+ auto res = hn::ShiftRight<FILTER_BITS - 1>(hn::Add(sum, bias_val));  // (sum + bias_val) >> (FILTER_BITS - 1)
+ if (chunk_size == 4) {  // chunk_size is a template parameter; any value other than 4 takes the else branch.
+ hn::StoreU(hn::LowerHalf(tag8_4, hn::DemoteTo(tag8_8, res)), tag8_4, dst);  // Stores only the lower half of the demoted vector.
+ } else {
+ // chunk_size == 8
+ hn::StoreU(hn::DemoteTo(tag8_8, res), tag8_8, dst);  // Stores the whole demoted vector.
+ }
+}
+
+template <typename T16_8, typename T8_16, typename M, typename VC>  // 2-tap sum for the chunk starting at src + j.
+HWY_ATTR HWY_INLINE hn::VFromD<T16_8> ComputeChunkSum2Tap(
+ T16_8 tag16_8, T8_16 tag8_16, const uint8_t *src, int j,
+ const M &shuffle_mask, const VC &coeff) {
+ (void)tag8_16;  // NOTE(review): redundant, tag8_16 is used by the LoadU on the next line.
+ auto d0 = hn::LoadU(tag8_16, src + j);  // Unaligned load of the source bytes at column j.
+ return hn::SatWidenMulPairwiseAdd(  // Single multiply-add of the shuffled bytes with coeff.
+ tag16_8, hn::TableLookupBytes(d0, shuffle_mask), coeff);
+}
+
+template <typename T16_8, typename T8_16, typename M, typename VC>  // 4-tap sum for the chunk starting at src + j.
+HWY_ATTR HWY_INLINE hn::VFromD<T16_8> ComputeChunkSum4Tap(
+ T16_8 tag16_8, T8_16 tag8_16, const uint8_t *src, int j,
+ const M &shuffle_mask, const VC &coeff0, const VC &coeff1) {
+ auto d0 = hn::LoadU(tag8_16, src + j + 0);  // Unaligned load at column j.
+ auto d2 = hn::LoadU(tag8_16, src + j + 2);  // Same row, two bytes further on.
+ return ComputeHoriz4TapSum(tag16_8, tag8_16, d0, d2, shuffle_mask, coeff0,  // coeff0 pairs with d0, coeff1 with d2.
+ coeff1);
+}
+
+template <typename T16_8, typename T8_16, typename M, typename VC>  // 8-tap sum for the chunk starting at src + j.
+HWY_ATTR HWY_INLINE hn::VFromD<T16_8> ComputeChunkSum8Tap(
+ T16_8 tag16_8, T8_16 tag8_16, const uint8_t *src, int j,
+ const M &shuffle_mask, const VC &coeff0, const VC &coeff1, const VC &coeff2,
+ const VC &coeff3) {
+ auto d0 = hn::LoadU(tag8_16, src + j + 0);  // Four unaligned loads from the same row,
+ auto d2 = hn::LoadU(tag8_16, src + j + 2);  // each starting two bytes after the previous one.
+ auto d4 = hn::LoadU(tag8_16, src + j + 4);
+ auto d6 = hn::LoadU(tag8_16, src + j + 6);
+ return ComputeHoriz8TapSum(tag16_8, tag8_16, d0, d2, d4, d6, shuffle_mask,  // coeff0..coeff3 pair with d0, d2, d4, d6 in order.
+ coeff0, coeff1, coeff2, coeff3);
+}
+
+template <int chunk_size, typename T8_8, typename T8_4, typename V16_8,  // Computes and stores one chunk at column j for two consecutive rows.
+ typename SumComputer>
+HWY_ATTR HWY_INLINE void Process2RowsChunk(T8_8 tag8_8, T8_4 tag8_4,
+ const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int j,
+ const V16_8 &bias_val,
+ SumComputer sum_computer) {  // sum_computer is invoked as sum_computer(row_pointer, j).
+ auto r0_sum = sum_computer(src, j);  // Row at src.
+ auto r1_sum = sum_computer(src + src_stride, j);  // Row at src + src_stride.
+ StoreOutputChunk<chunk_size>(tag8_8, tag8_4, r0_sum, bias_val, dst + j);  // Both sums are computed before either store.
+ StoreOutputChunk<chunk_size>(tag8_8, tag8_4, r1_sum, bias_val,
+ dst + dst_stride + j);
+}
+
+template <int chunk_size, typename T8_8, typename T8_4, typename V16_8,  // Computes and stores one chunk at column j for a single row.
+ typename SumComputer>
+HWY_ATTR HWY_INLINE void Process1RowChunk(T8_8 tag8_8, T8_4 tag8_4,
+ const uint8_t *src, uint8_t *dst,
+ int j, const V16_8 &bias_val,
+ SumComputer sum_computer) {  // sum_computer is invoked as sum_computer(src, j).
+ auto r0_sum = sum_computer(src, j);  // Sum for the row at src.
+ StoreOutputChunk<chunk_size>(tag8_8, tag8_4, r0_sum, bias_val, dst + j);  // Written to dst + j.
+}
+
template <typename D>
HWY_ATTR HWY_INLINE hn::VFromD<D> LoadUnaligned4x4(D tag16, const uint8_t *buf,
ptrdiff_t stride) {
@@ -134,6 +244,107 @@ HWY_ATTR HWY_INLINE void StoreUnaligned4x8(D tag, uint8_t *buf,
HWY_ATTR inline void ConvolveHoriz2Tap(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int w, int h) {
+ const bool can_use_pareto_optimal_2tap =
+ (h == 32 && (w == 16 || w == 32 || w == 64)) && (filter_x[3] % 2 == 0) &&
+ (filter_x[4] % 2 == 0);
+
+ if (can_use_pareto_optimal_2tap) {
+ if (w == 16) {
+ constexpr hn::CappedTag<uint8_t, 16> u8_16_tag;
+ constexpr hn::CappedTag<int8_t, 16> i8_16_tag;
+ constexpr hn::CappedTag<int16_t, 8> i16_8_tag;
+
+ const auto shuf34_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+
+ const int8_t c3 = static_cast<int8_t>(filter_x[3] / 2);
+ const int8_t c4 = static_cast<int8_t>(filter_x[4] / 2);
+
+ const auto coeff34_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c3, c4, c3, c4, c3, c4, c3, c4, c3,
+ c4, c3, c4, c3, c4, c3, c4);
+
+ const auto round_vec_16 = hn::Set(i16_8_tag, 1 << (FILTER_BITS - 2));
+
+ for (int y = 0; y < h; ++y) {
+ const uint8_t *src_row = src + y * src_stride;
+ uint8_t *dst_row = dst + y * dst_stride;
+
+ auto v0 = hn::LoadU(u8_16_tag, src_row + 0);
+ auto v8 = hn::LoadU(u8_16_tag, src_row + 8);
+
+ auto p34_0 = hn::TableLookupBytes(v0, shuf34_16);
+ auto res0 = hn::ShiftRightSame(
+ hn::Add(hn::SatWidenMulPairwiseAdd(i16_8_tag, p34_0, coeff34_16),
+ round_vec_16),
+ FILTER_BITS - 1);
+
+ auto p34_8 = hn::TableLookupBytes(v8, shuf34_16);
+ auto res8 = hn::ShiftRightSame(
+ hn::Add(hn::SatWidenMulPairwiseAdd(i16_8_tag, p34_8, coeff34_16),
+ round_vec_16),
+ FILTER_BITS - 1);
+
+ auto packed = hn::ReorderDemote2To(u8_16_tag, res0, res8);
+ hn::StoreU(packed, u8_16_tag, dst_row);
+ }
+ } else {
+ constexpr hn::CappedTag<uint8_t, 16> u8_16_tag;
+ constexpr hn::CappedTag<uint8_t, 32> u8_32_tag;
+ constexpr hn::CappedTag<int8_t, 16> i8_16_tag;
+ constexpr hn::CappedTag<int8_t, 32> i8_32_tag;
+ constexpr hn::CappedTag<int16_t, 16> i16_16_tag;
+
+ const auto shuf34_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ const auto shuf34_32 = hn::Combine(u8_32_tag, shuf34_16, shuf34_16);
+
+ const int8_t c3 = static_cast<int8_t>(filter_x[3] / 2);
+ const int8_t c4 = static_cast<int8_t>(filter_x[4] / 2);
+
+ const auto coeff34_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c3, c4, c3, c4, c3, c4, c3, c4, c3,
+ c4, c3, c4, c3, c4, c3, c4);
+ const auto coeff34_32 = hn::Combine(i8_32_tag, coeff34_16, coeff34_16);
+
+ const auto round_vec_32 = hn::Set(i16_16_tag, 1 << (FILTER_BITS - 2));
+
+ for (int y = 0; y < h; ++y) {
+ const uint8_t *src_row = src + y * src_stride;
+ uint8_t *dst_row = dst + y * dst_stride;
+
+ for (int j = 0; j < w; j += 32) {
+ auto v_curr = hn::LoadU(u8_32_tag, src_row + j + 0);
+ auto v8 = hn::LoadU(u8_32_tag, src_row + j + 8);
+
+ auto p34_0 = hn::TableLookupBytes(v_curr, shuf34_32);
+ auto res0 = hn::ShiftRightSame(
+ hn::Add(hn::SatWidenMulPairwiseAdd(i16_16_tag, p34_0, coeff34_32),
+ round_vec_32),
+ FILTER_BITS - 1);
+
+ auto p34_8 = hn::TableLookupBytes(v8, shuf34_32);
+ auto res8 = hn::ShiftRightSame(
+ hn::Add(hn::SatWidenMulPairwiseAdd(i16_16_tag, p34_8, coeff34_32),
+ round_vec_32),
+ FILTER_BITS - 1);
+
+ constexpr hn::CappedTag<uint8_t, 8> u8_8_tag;
+ auto demoted0 = hn::DemoteTo(u8_16_tag, res0);
+ auto demoted8 = hn::DemoteTo(u8_16_tag, res8);
+ auto p0_15 = hn::Combine(u8_16_tag, hn::LowerHalf(u8_8_tag, demoted8),
+ hn::LowerHalf(u8_8_tag, demoted0));
+ auto p16_31 =
+ hn::Combine(u8_16_tag, hn::UpperHalf(u8_8_tag, demoted8),
+ hn::UpperHalf(u8_8_tag, demoted0));
+ hn::StoreU(p0_15, u8_16_tag, dst_row + j);
+ hn::StoreU(p16_31, u8_16_tag, dst_row + j + 16);
+ }
+ }
+ }
+ return;
+ }
+
const bool can_use_optimized_path =
(w <= 32) && (filter_x[3] % 2 == 0) && (filter_x[4] % 2 == 0);
@@ -154,153 +365,66 @@ HWY_ATTR inline void ConvolveHoriz2Tap(const uint8_t *src, ptrdiff_t src_stride,
const auto coeff34 = hn::Dup128VecFromValues(
tag_i8, c3, c4, c3, c4, c3, c4, c3, c4, c3, c4, c3, c4, c3, c4, c3, c4);
+ auto sum_2tap = [&](const uint8_t *s, int offset) {
+ return ComputeChunkSum2Tap(tag16_8, tag8_16, s, offset, shuffle_mask,
+ coeff34);
+ };
+
if (w == 4) {
while (h >= 2) {
- auto r0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r1_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
-
- auto r0_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d0, shuffle_mask), coeff34);
- auto r1_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d0, shuffle_mask), coeff34);
-
- hn::StoreU(
- hn::LowerHalf(tag8_4,
- hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_sum, bias_val)))),
- tag8_4, dst);
- hn::StoreU(
- hn::LowerHalf(tag8_4,
- hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_sum, bias_val)))),
- tag8_4, dst + dst_stride);
-
+ Process2RowsChunk<4>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_2tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ Process1RowChunk<4>(tag8_8, tag8_4, src, dst, 0, bias_val, sum_2tap);
+ }
} else if (w == 8) {
while (h >= 2) {
- auto r0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r1_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
-
- auto r0_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d0, shuffle_mask), coeff34);
- auto r1_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d0, shuffle_mask), coeff34);
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_sum, bias_val))),
- tag8_8, dst);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_sum, bias_val))),
- tag8_8, dst + dst_stride);
-
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_2tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, 0, bias_val, sum_2tap);
+ }
} else if (w == 16) {
while (h >= 2) {
- auto r0_j0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r0_j8_d0 = hn::LoadU(tag8_16, src + 8);
-
- auto r1_j0_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
- auto r1_j8_d0 = hn::LoadU(tag8_16, src + src_stride + 8);
-
- auto r0_j0_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d0, shuffle_mask), coeff34);
- auto r0_j8_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d0, shuffle_mask), coeff34);
-
- auto r1_j0_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d0, shuffle_mask), coeff34);
- auto r1_j8_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d0, shuffle_mask), coeff34);
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j0_sum, bias_val))),
- tag8_8, dst + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j8_sum, bias_val))),
- tag8_8, dst + 8);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j0_sum, bias_val))),
- tag8_8, dst + dst_stride + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j8_sum, bias_val))),
- tag8_8, dst + dst_stride + 8);
-
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_2tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 8, bias_val, sum_2tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, 0, bias_val, sum_2tap);
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, 8, bias_val, sum_2tap);
+ }
} else if (w == 32) {
while (h >= 2) {
- {
- auto r0_j0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r0_j8_d0 = hn::LoadU(tag8_16, src + 8);
-
- auto r1_j0_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
- auto r1_j8_d0 = hn::LoadU(tag8_16, src + src_stride + 8);
-
- auto r0_j0_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d0, shuffle_mask), coeff34);
- auto r0_j8_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d0, shuffle_mask), coeff34);
-
- auto r1_j0_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d0, shuffle_mask), coeff34);
- auto r1_j8_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d0, shuffle_mask), coeff34);
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j0_sum, bias_val))),
- tag8_8, dst + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j8_sum, bias_val))),
- tag8_8, dst + 8);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j0_sum, bias_val))),
- tag8_8, dst + dst_stride + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j8_sum, bias_val))),
- tag8_8, dst + dst_stride + 8);
- }
- {
- auto r0_j16_d0 = hn::LoadU(tag8_16, src + 16);
- auto r0_j24_d0 = hn::LoadU(tag8_16, src + 24);
-
- auto r1_j16_d0 = hn::LoadU(tag8_16, src + src_stride + 16);
- auto r1_j24_d0 = hn::LoadU(tag8_16, src + src_stride + 24);
-
- auto r0_j16_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j16_d0, shuffle_mask), coeff34);
- auto r0_j24_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j24_d0, shuffle_mask), coeff34);
-
- auto r1_j16_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j16_d0, shuffle_mask), coeff34);
- auto r1_j24_sum = hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j24_d0, shuffle_mask), coeff34);
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j16_sum, bias_val))),
- tag8_8, dst + 16);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j24_sum, bias_val))),
- tag8_8, dst + 24);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j16_sum, bias_val))),
- tag8_8, dst + dst_stride + 16);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j24_sum, bias_val))),
- tag8_8, dst + dst_stride + 24);
- }
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_2tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 8, bias_val, sum_2tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 16, bias_val, sum_2tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 24, bias_val, sum_2tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ for (int j = 0; j < 32; j += 8) {
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, j, bias_val, sum_2tap);
+ }
+ }
}
} else {
hn::ScalableTag<int16_t> mul_tag;
@@ -345,6 +469,134 @@ HWY_ATTR HWY_INLINE hn::VFromD<D> Convolve4_8(
HWY_ATTR inline void ConvolveHoriz4Tap(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int w, int h) {
+ const bool can_use_pareto_optimal_4tap =
+ (h == 32 && (w == 16 || w == 32 || w == 64)) && (filter_x[2] % 2 == 0) &&
+ (filter_x[3] % 2 == 0) && (filter_x[4] % 2 == 0) &&
+ (filter_x[5] % 2 == 0);
+
+ if (can_use_pareto_optimal_4tap) {
+ if (w == 16) {
+ constexpr hn::CappedTag<uint8_t, 16> u8_16_tag;
+ constexpr hn::CappedTag<int8_t, 16> i8_16_tag;
+ constexpr hn::CappedTag<int16_t, 8> i16_8_tag;
+
+ const auto shuf01_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ const auto shuf23_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ const int8_t c2 = static_cast<int8_t>(filter_x[2] / 2);
+ const int8_t c3 = static_cast<int8_t>(filter_x[3] / 2);
+ const int8_t c4 = static_cast<int8_t>(filter_x[4] / 2);
+ const int8_t c5 = static_cast<int8_t>(filter_x[5] / 2);
+
+ const auto coeff23_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c2, c3, c2, c3, c2, c3, c2, c3, c2,
+ c3, c2, c3, c2, c3, c2, c3);
+ const auto coeff45_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c4, c5, c4, c5, c4, c5, c4, c5, c4,
+ c5, c4, c5, c4, c5, c4, c5);
+
+ const auto round_vec_16 = hn::Set(i16_8_tag, 1 << (FILTER_BITS - 2));
+
+ for (int y = 0; y < h; ++y) {
+ const uint8_t *src_row = src + y * src_stride;
+ uint8_t *dst_row = dst + y * dst_stride;
+
+ auto v0 = hn::LoadU(u8_16_tag, src_row + 0);
+ auto v8 = hn::LoadU(u8_16_tag, src_row + 8);
+
+ auto p01_0 = hn::TableLookupBytes(v0, shuf01_16);
+ auto p23_0 = hn::TableLookupBytes(v0, shuf23_16);
+ auto sum0 =
+ hn::Add(hn::SatWidenMulPairwiseAdd(i16_8_tag, p01_0, coeff23_16),
+ hn::SatWidenMulPairwiseAdd(i16_8_tag, p23_0, coeff45_16));
+ auto res0 =
+ hn::ShiftRightSame(hn::Add(sum0, round_vec_16), FILTER_BITS - 1);
+
+ auto p01_8 = hn::TableLookupBytes(v8, shuf01_16);
+ auto p23_8 = hn::TableLookupBytes(v8, shuf23_16);
+ auto sum0_8 =
+ hn::Add(hn::SatWidenMulPairwiseAdd(i16_8_tag, p01_8, coeff23_16),
+ hn::SatWidenMulPairwiseAdd(i16_8_tag, p23_8, coeff45_16));
+ auto res8 =
+ hn::ShiftRightSame(hn::Add(sum0_8, round_vec_16), FILTER_BITS - 1);
+
+ auto packed = hn::ReorderDemote2To(u8_16_tag, res0, res8);
+ hn::StoreU(packed, u8_16_tag, dst_row);
+ }
+ } else {
+ constexpr hn::CappedTag<uint8_t, 16> u8_16_tag;
+ constexpr hn::CappedTag<uint8_t, 32> u8_32_tag;
+ constexpr hn::CappedTag<int8_t, 16> i8_16_tag;
+ constexpr hn::CappedTag<int8_t, 32> i8_32_tag;
+ constexpr hn::CappedTag<int16_t, 16> i16_16_tag;
+
+ const auto shuf01_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ const auto shuf23_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ const auto shuf01_32 = hn::Combine(u8_32_tag, shuf01_16, shuf01_16);
+ const auto shuf23_32 = hn::Combine(u8_32_tag, shuf23_16, shuf23_16);
+
+ const int8_t c2 = static_cast<int8_t>(filter_x[2] / 2);
+ const int8_t c3 = static_cast<int8_t>(filter_x[3] / 2);
+ const int8_t c4 = static_cast<int8_t>(filter_x[4] / 2);
+ const int8_t c5 = static_cast<int8_t>(filter_x[5] / 2);
+
+ const auto coeff23_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c2, c3, c2, c3, c2, c3, c2, c3, c2,
+ c3, c2, c3, c2, c3, c2, c3);
+ const auto coeff45_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c4, c5, c4, c5, c4, c5, c4, c5, c4,
+ c5, c4, c5, c4, c5, c4, c5);
+
+ const auto coeff23_32 = hn::Combine(i8_32_tag, coeff23_16, coeff23_16);
+ const auto coeff45_32 = hn::Combine(i8_32_tag, coeff45_16, coeff45_16);
+
+ const auto round_vec_32 = hn::Set(i16_16_tag, 1 << (FILTER_BITS - 2));
+
+ for (int y = 0; y < h; ++y) {
+ const uint8_t *src_row = src + y * src_stride;
+ uint8_t *dst_row = dst + y * dst_stride;
+
+ for (int j = 0; j < w; j += 32) {
+ auto v_curr = hn::LoadU(u8_32_tag, src_row + j + 0);
+ auto v8 = hn::LoadU(u8_32_tag, src_row + j + 8);
+
+ auto p01_0 = hn::TableLookupBytes(v_curr, shuf01_32);
+ auto p23_0 = hn::TableLookupBytes(v_curr, shuf23_32);
+ auto sum0 = hn::Add(
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p01_0, coeff23_32),
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p23_0, coeff45_32));
+ auto res0 =
+ hn::ShiftRightSame(hn::Add(sum0, round_vec_32), FILTER_BITS - 1);
+
+ auto p01_8 = hn::TableLookupBytes(v8, shuf01_32);
+ auto p23_8 = hn::TableLookupBytes(v8, shuf23_32);
+ auto sum0_8 = hn::Add(
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p01_8, coeff23_32),
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p23_8, coeff45_32));
+ auto res8 = hn::ShiftRightSame(hn::Add(sum0_8, round_vec_32),
+ FILTER_BITS - 1);
+
+ constexpr hn::CappedTag<uint8_t, 8> u8_8_tag;
+ auto demoted0 = hn::DemoteTo(u8_16_tag, res0);
+ auto demoted8 = hn::DemoteTo(u8_16_tag, res8);
+ auto p0_15 = hn::Combine(u8_16_tag, hn::LowerHalf(u8_8_tag, demoted8),
+ hn::LowerHalf(u8_8_tag, demoted0));
+ auto p16_31 =
+ hn::Combine(u8_16_tag, hn::UpperHalf(u8_8_tag, demoted8),
+ hn::UpperHalf(u8_8_tag, demoted0));
+ hn::StoreU(p0_15, u8_16_tag, dst_row + j);
+ hn::StoreU(p16_31, u8_16_tag, dst_row + j + 16);
+ }
+ }
+ }
+ return;
+ }
+
const bool can_use_optimized_path =
(w <= 32) && (filter_x[2] % 2 == 0) && (filter_x[3] % 2 == 0) &&
(filter_x[4] % 2 == 0) && (filter_x[5] % 2 == 0);
@@ -371,253 +623,66 @@ HWY_ATTR inline void ConvolveHoriz4Tap(const uint8_t *src, ptrdiff_t src_stride,
const auto coeff45 = hn::Dup128VecFromValues(
tag_i8, c4, c5, c4, c5, c4, c5, c4, c5, c4, c5, c4, c5, c4, c5, c4, c5);
+ auto sum_4tap = [&](const uint8_t *s, int offset) {
+ return ComputeChunkSum4Tap(tag16_8, tag8_16, s, offset, shuffle_mask,
+ coeff23, coeff45);
+ };
+
if (w == 4) {
while (h >= 2) {
- auto r0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r0_d2 = hn::LoadU(tag8_16, src + 2);
-
- auto r1_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
- auto r1_d2 = hn::LoadU(tag8_16, src + src_stride + 2);
-
- auto r0_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d0, shuffle_mask), coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d2, shuffle_mask), coeff45));
-
- auto r1_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d0, shuffle_mask), coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d2, shuffle_mask), coeff45));
-
- hn::StoreU(
- hn::LowerHalf(tag8_4,
- hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_sum, bias_val)))),
- tag8_4, dst);
- hn::StoreU(
- hn::LowerHalf(tag8_4,
- hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_sum, bias_val)))),
- tag8_4, dst + dst_stride);
-
+ Process2RowsChunk<4>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_4tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ Process1RowChunk<4>(tag8_8, tag8_4, src, dst, 0, bias_val, sum_4tap);
+ }
} else if (w == 8) {
while (h >= 2) {
- auto r0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r0_d2 = hn::LoadU(tag8_16, src + 2);
-
- auto r1_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
- auto r1_d2 = hn::LoadU(tag8_16, src + src_stride + 2);
-
- auto r0_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d0, shuffle_mask), coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d2, shuffle_mask), coeff45));
-
- auto r1_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d0, shuffle_mask), coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d2, shuffle_mask), coeff45));
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_sum, bias_val))),
- tag8_8, dst);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_sum, bias_val))),
- tag8_8, dst + dst_stride);
-
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_4tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, 0, bias_val, sum_4tap);
+ }
} else if (w == 16) {
while (h >= 2) {
- auto r0_j0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r0_j0_d2 = hn::LoadU(tag8_16, src + 2);
-
- auto r0_j8_d0 = hn::LoadU(tag8_16, src + 8);
- auto r0_j8_d2 = hn::LoadU(tag8_16, src + 10);
-
- auto r1_j0_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
- auto r1_j0_d2 = hn::LoadU(tag8_16, src + src_stride + 2);
-
- auto r1_j8_d0 = hn::LoadU(tag8_16, src + src_stride + 8);
- auto r1_j8_d2 = hn::LoadU(tag8_16, src + src_stride + 10);
-
- auto r0_j0_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d0, shuffle_mask), coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d2, shuffle_mask),
- coeff45));
-
- auto r0_j8_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d0, shuffle_mask), coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d2, shuffle_mask),
- coeff45));
-
- auto r1_j0_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d0, shuffle_mask), coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d2, shuffle_mask),
- coeff45));
-
- auto r1_j8_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d0, shuffle_mask), coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d2, shuffle_mask),
- coeff45));
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j0_sum, bias_val))),
- tag8_8, dst + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j8_sum, bias_val))),
- tag8_8, dst + 8);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j0_sum, bias_val))),
- tag8_8, dst + dst_stride + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j8_sum, bias_val))),
- tag8_8, dst + dst_stride + 8);
-
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_4tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 8, bias_val, sum_4tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, 0, bias_val, sum_4tap);
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, 8, bias_val, sum_4tap);
+ }
} else if (w == 32) {
while (h >= 2) {
- {
- auto r0_j0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r0_j0_d2 = hn::LoadU(tag8_16, src + 2);
-
- auto r0_j8_d0 = hn::LoadU(tag8_16, src + 8);
- auto r0_j8_d2 = hn::LoadU(tag8_16, src + 10);
-
- auto r1_j0_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
- auto r1_j0_d2 = hn::LoadU(tag8_16, src + src_stride + 2);
-
- auto r1_j8_d0 = hn::LoadU(tag8_16, src + src_stride + 8);
- auto r1_j8_d2 = hn::LoadU(tag8_16, src + src_stride + 10);
-
- auto r0_j0_sum =
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d0, shuffle_mask),
- coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d2, shuffle_mask),
- coeff45));
-
- auto r0_j8_sum =
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d0, shuffle_mask),
- coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d2, shuffle_mask),
- coeff45));
-
- auto r1_j0_sum =
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d0, shuffle_mask),
- coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d2, shuffle_mask),
- coeff45));
-
- auto r1_j8_sum =
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d0, shuffle_mask),
- coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d2, shuffle_mask),
- coeff45));
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j0_sum, bias_val))),
- tag8_8, dst + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j8_sum, bias_val))),
- tag8_8, dst + 8);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j0_sum, bias_val))),
- tag8_8, dst + dst_stride + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j8_sum, bias_val))),
- tag8_8, dst + dst_stride + 8);
- }
- {
- auto r0_j16_d0 = hn::LoadU(tag8_16, src + 16);
- auto r0_j16_d2 = hn::LoadU(tag8_16, src + 18);
-
- auto r0_j24_d0 = hn::LoadU(tag8_16, src + 24);
- auto r0_j24_d2 = hn::LoadU(tag8_16, src + 26);
-
- auto r1_j16_d0 = hn::LoadU(tag8_16, src + src_stride + 16);
- auto r1_j16_d2 = hn::LoadU(tag8_16, src + src_stride + 18);
-
- auto r1_j24_d0 = hn::LoadU(tag8_16, src + src_stride + 24);
- auto r1_j24_d2 = hn::LoadU(tag8_16, src + src_stride + 26);
-
- auto r0_j16_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j16_d0, shuffle_mask),
- coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j16_d2, shuffle_mask),
- coeff45));
-
- auto r0_j24_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j24_d0, shuffle_mask),
- coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j24_d2, shuffle_mask),
- coeff45));
-
- auto r1_j16_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j16_d0, shuffle_mask),
- coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j16_d2, shuffle_mask),
- coeff45));
-
- auto r1_j24_sum = hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j24_d0, shuffle_mask),
- coeff23),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j24_d2, shuffle_mask),
- coeff45));
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j16_sum, bias_val))),
- tag8_8, dst + 16);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j24_sum, bias_val))),
- tag8_8, dst + 24);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j16_sum, bias_val))),
- tag8_8, dst + dst_stride + 16);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j24_sum, bias_val))),
- tag8_8, dst + dst_stride + 24);
- }
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_4tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 8, bias_val, sum_4tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 16, bias_val, sum_4tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 24, bias_val, sum_4tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ for (int j = 0; j < 32; j += 8) {
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, j, bias_val, sum_4tap);
+ }
+ }
}
} else {
hn::ScalableTag<int16_t> mul_tag;
@@ -678,6 +743,189 @@ HWY_ATTR HWY_INLINE hn::VFromD<D> Convolve8_8(
HWY_ATTR inline void ConvolveHoriz8Tap(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int w, int h) {
+ const bool can_use_pareto_optimal_8tap =
+ (h == 32 && (w == 16 || w == 32 || w == 64)) && (filter_x[0] % 2 == 0) &&
+ (filter_x[1] % 2 == 0) && (filter_x[2] % 2 == 0) &&
+ (filter_x[3] % 2 == 0) && (filter_x[4] % 2 == 0) &&
+ (filter_x[5] % 2 == 0) && (filter_x[6] % 2 == 0) &&
+ (filter_x[7] % 2 == 0);
+
+ if (can_use_pareto_optimal_8tap) {
+ if (w == 16) {
+ constexpr hn::CappedTag<uint8_t, 16> u8_16_tag;
+ constexpr hn::CappedTag<int8_t, 16> i8_16_tag;
+ constexpr hn::CappedTag<int16_t, 8> i16_8_tag;
+
+ // Construct shuffles locally (no filt_global load)
+ const auto shuf01_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ const auto shuf23_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+ const auto shuf45_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12);
+ const auto shuf67_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14);
+
+ const int8_t c0 = static_cast<int8_t>(filter_x[0] / 2);
+ const int8_t c1 = static_cast<int8_t>(filter_x[1] / 2);
+ const int8_t c2 = static_cast<int8_t>(filter_x[2] / 2);
+ const int8_t c3 = static_cast<int8_t>(filter_x[3] / 2);
+ const int8_t c4 = static_cast<int8_t>(filter_x[4] / 2);
+ const int8_t c5 = static_cast<int8_t>(filter_x[5] / 2);
+ const int8_t c6 = static_cast<int8_t>(filter_x[6] / 2);
+ const int8_t c7 = static_cast<int8_t>(filter_x[7] / 2);
+
+ const auto coeff01_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c0, c1, c0, c1, c0, c1, c0, c1, c0,
+ c1, c0, c1, c0, c1, c0, c1);
+ const auto coeff23_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c2, c3, c2, c3, c2, c3, c2, c3, c2,
+ c3, c2, c3, c2, c3, c2, c3);
+ const auto coeff45_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c4, c5, c4, c5, c4, c5, c4, c5, c4,
+ c5, c4, c5, c4, c5, c4, c5);
+ const auto coeff67_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c6, c7, c6, c7, c6, c7, c6, c7, c6,
+ c7, c6, c7, c6, c7, c6, c7);
+
+ const auto round_vec_16 = hn::Set(i16_8_tag, 1 << (FILTER_BITS - 2));
+
+ for (int y = 0; y < h; ++y) {
+ const uint8_t *src_row = src + y * src_stride;
+ uint8_t *dst_row = dst + y * dst_stride;
+
+ auto v0 = hn::LoadU(u8_16_tag, src_row + 0);
+ auto v8 = hn::LoadU(u8_16_tag, src_row + 8);
+
+ auto p01_0 = hn::TableLookupBytes(v0, shuf01_16);
+ auto p23_0 = hn::TableLookupBytes(v0, shuf23_16);
+ auto p45_0 = hn::TableLookupBytes(v0, shuf45_16);
+ auto p67_0 = hn::TableLookupBytes(v0, shuf67_16);
+ auto sum0 =
+ hn::Add(hn::SatWidenMulPairwiseAdd(i16_8_tag, p01_0, coeff01_16),
+ hn::SatWidenMulPairwiseAdd(i16_8_tag, p23_0, coeff23_16));
+ auto sum1 =
+ hn::Add(hn::SatWidenMulPairwiseAdd(i16_8_tag, p45_0, coeff45_16),
+ hn::SatWidenMulPairwiseAdd(i16_8_tag, p67_0, coeff67_16));
+ auto res0 = hn::ShiftRightSame(
+ hn::Add(hn::Add(sum0, sum1), round_vec_16), FILTER_BITS - 1);
+
+ auto p01_8 = hn::TableLookupBytes(v8, shuf01_16);
+ auto p23_8 = hn::TableLookupBytes(v8, shuf23_16);
+ auto p45_8 = hn::TableLookupBytes(v8, shuf45_16);
+ auto p67_8 = hn::TableLookupBytes(v8, shuf67_16);
+ auto sum0_8 =
+ hn::Add(hn::SatWidenMulPairwiseAdd(i16_8_tag, p01_8, coeff01_16),
+ hn::SatWidenMulPairwiseAdd(i16_8_tag, p23_8, coeff23_16));
+ auto sum1_8 =
+ hn::Add(hn::SatWidenMulPairwiseAdd(i16_8_tag, p45_8, coeff45_16),
+ hn::SatWidenMulPairwiseAdd(i16_8_tag, p67_8, coeff67_16));
+ auto res8 = hn::ShiftRightSame(
+ hn::Add(hn::Add(sum0_8, sum1_8), round_vec_16), FILTER_BITS - 1);
+
+ auto packed = hn::ReorderDemote2To(u8_16_tag, res0, res8);
+ hn::StoreU(packed, u8_16_tag, dst_row);
+ }
+ } else {
+ constexpr hn::CappedTag<uint8_t, 16> u8_16_tag;
+ constexpr hn::CappedTag<uint8_t, 32> u8_32_tag;
+ constexpr hn::CappedTag<int8_t, 16> i8_16_tag;
+ constexpr hn::CappedTag<int8_t, 32> i8_32_tag;
+ constexpr hn::CappedTag<int16_t, 16> i16_16_tag;
+
+ const auto shuf01_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ const auto shuf23_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+ const auto shuf45_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12);
+ const auto shuf67_16 = hn::Dup128VecFromValues(
+ u8_16_tag, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14);
+
+ const auto shuf01_32 = hn::Combine(u8_32_tag, shuf01_16, shuf01_16);
+ const auto shuf23_32 = hn::Combine(u8_32_tag, shuf23_16, shuf23_16);
+ const auto shuf45_32 = hn::Combine(u8_32_tag, shuf45_16, shuf45_16);
+ const auto shuf67_32 = hn::Combine(u8_32_tag, shuf67_16, shuf67_16);
+
+ const int8_t c0 = static_cast<int8_t>(filter_x[0] / 2);
+ const int8_t c1 = static_cast<int8_t>(filter_x[1] / 2);
+ const int8_t c2 = static_cast<int8_t>(filter_x[2] / 2);
+ const int8_t c3 = static_cast<int8_t>(filter_x[3] / 2);
+ const int8_t c4 = static_cast<int8_t>(filter_x[4] / 2);
+ const int8_t c5 = static_cast<int8_t>(filter_x[5] / 2);
+ const int8_t c6 = static_cast<int8_t>(filter_x[6] / 2);
+ const int8_t c7 = static_cast<int8_t>(filter_x[7] / 2);
+
+ const auto coeff01_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c0, c1, c0, c1, c0, c1, c0, c1, c0,
+ c1, c0, c1, c0, c1, c0, c1);
+ const auto coeff23_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c2, c3, c2, c3, c2, c3, c2, c3, c2,
+ c3, c2, c3, c2, c3, c2, c3);
+ const auto coeff45_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c4, c5, c4, c5, c4, c5, c4, c5, c4,
+ c5, c4, c5, c4, c5, c4, c5);
+ const auto coeff67_16 =
+ hn::Dup128VecFromValues(i8_16_tag, c6, c7, c6, c7, c6, c7, c6, c7, c6,
+ c7, c6, c7, c6, c7, c6, c7);
+
+ const auto coeff01_32 = hn::Combine(i8_32_tag, coeff01_16, coeff01_16);
+ const auto coeff23_32 = hn::Combine(i8_32_tag, coeff23_16, coeff23_16);
+ const auto coeff45_32 = hn::Combine(i8_32_tag, coeff45_16, coeff45_16);
+ const auto coeff67_32 = hn::Combine(i8_32_tag, coeff67_16, coeff67_16);
+
+ const auto round_vec_32 = hn::Set(i16_16_tag, 1 << (FILTER_BITS - 2));
+
+ for (int y = 0; y < h; ++y) {
+ const uint8_t *src_row = src + y * src_stride;
+ uint8_t *dst_row = dst + y * dst_stride;
+
+ for (int j = 0; j < w; j += 32) {
+ auto v_curr = hn::LoadU(u8_32_tag, src_row + j + 0);
+ auto v8 = hn::LoadU(u8_32_tag, src_row + j + 8);
+
+ auto p01_0 = hn::TableLookupBytes(v_curr, shuf01_32);
+ auto p23_0 = hn::TableLookupBytes(v_curr, shuf23_32);
+ auto p45_0 = hn::TableLookupBytes(v_curr, shuf45_32);
+ auto p67_0 = hn::TableLookupBytes(v_curr, shuf67_32);
+ auto sum0 = hn::Add(
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p01_0, coeff01_32),
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p23_0, coeff23_32));
+ auto sum1 = hn::Add(
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p45_0, coeff45_32),
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p67_0, coeff67_32));
+ auto res0 = hn::ShiftRightSame(
+ hn::Add(hn::Add(sum0, sum1), round_vec_32), FILTER_BITS - 1);
+
+ auto p01_8 = hn::TableLookupBytes(v8, shuf01_32);
+ auto p23_8 = hn::TableLookupBytes(v8, shuf23_32);
+ auto p45_8 = hn::TableLookupBytes(v8, shuf45_32);
+ auto p67_8 = hn::TableLookupBytes(v8, shuf67_32);
+ auto sum0_8 = hn::Add(
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p01_8, coeff01_32),
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p23_8, coeff23_32));
+ auto sum1_8 = hn::Add(
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p45_8, coeff45_32),
+ hn::SatWidenMulPairwiseAdd(i16_16_tag, p67_8, coeff67_32));
+ auto res8 = hn::ShiftRightSame(
+ hn::Add(hn::Add(sum0_8, sum1_8), round_vec_32), FILTER_BITS - 1);
+
+ constexpr hn::CappedTag<uint8_t, 8> u8_8_tag;
+ auto demoted0 = hn::DemoteTo(u8_16_tag, res0);
+ auto demoted8 = hn::DemoteTo(u8_16_tag, res8);
+ auto p0_15 = hn::Combine(u8_16_tag, hn::LowerHalf(u8_8_tag, demoted8),
+ hn::LowerHalf(u8_8_tag, demoted0));
+ auto p16_31 =
+ hn::Combine(u8_16_tag, hn::UpperHalf(u8_8_tag, demoted8),
+ hn::UpperHalf(u8_8_tag, demoted0));
+ hn::StoreU(p0_15, u8_16_tag, dst_row + j);
+ hn::StoreU(p16_31, u8_16_tag, dst_row + j + 16);
+ }
+ }
+ }
+ return;
+ }
+
const bool can_use_optimized_path =
(w <= 32) && (filter_x[0] % 2 == 0) && (filter_x[1] % 2 == 0) &&
(filter_x[2] % 2 == 0) && (filter_x[3] % 2 == 0) &&
@@ -716,401 +964,66 @@ HWY_ATTR inline void ConvolveHoriz8Tap(const uint8_t *src, ptrdiff_t src_stride,
const auto coeff67 = hn::Dup128VecFromValues(
tag_i8, c6, c7, c6, c7, c6, c7, c6, c7, c6, c7, c6, c7, c6, c7, c6, c7);
+ auto sum_8tap = [&](const uint8_t *s, int offset) {
+ return ComputeChunkSum8Tap(tag16_8, tag8_16, s, offset, shuffle_mask,
+ coeff01, coeff23, coeff45, coeff67);
+ };
+
if (w == 4) {
while (h >= 2) {
- auto r0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r0_d2 = hn::LoadU(tag8_16, src + 2);
- auto r0_d4 = hn::LoadU(tag8_16, src + 4);
- auto r0_d6 = hn::LoadU(tag8_16, src + 6);
-
- auto r1_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
- auto r1_d2 = hn::LoadU(tag8_16, src + src_stride + 2);
- auto r1_d4 = hn::LoadU(tag8_16, src + src_stride + 4);
- auto r1_d6 = hn::LoadU(tag8_16, src + src_stride + 6);
-
- auto r0_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d6, shuffle_mask),
- coeff67)));
-
- auto r1_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d6, shuffle_mask),
- coeff67)));
-
- hn::StoreU(
- hn::LowerHalf(tag8_4,
- hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_sum, bias_val)))),
- tag8_4, dst);
- hn::StoreU(
- hn::LowerHalf(tag8_4,
- hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_sum, bias_val)))),
- tag8_4, dst + dst_stride);
-
+ Process2RowsChunk<4>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_8tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ Process1RowChunk<4>(tag8_8, tag8_4, src, dst, 0, bias_val, sum_8tap);
+ }
} else if (w == 8) {
while (h >= 2) {
- auto r0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r0_d2 = hn::LoadU(tag8_16, src + 2);
- auto r0_d4 = hn::LoadU(tag8_16, src + 4);
- auto r0_d6 = hn::LoadU(tag8_16, src + 6);
-
- auto r1_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
- auto r1_d2 = hn::LoadU(tag8_16, src + src_stride + 2);
- auto r1_d4 = hn::LoadU(tag8_16, src + src_stride + 4);
- auto r1_d6 = hn::LoadU(tag8_16, src + src_stride + 6);
-
- auto r0_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_d6, shuffle_mask),
- coeff67)));
-
- auto r1_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_d6, shuffle_mask),
- coeff67)));
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_sum, bias_val))),
- tag8_8, dst);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_sum, bias_val))),
- tag8_8, dst + dst_stride);
-
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_8tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, 0, bias_val, sum_8tap);
+ }
} else if (w == 16) {
while (h >= 2) {
- auto r0_j0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r0_j0_d2 = hn::LoadU(tag8_16, src + 2);
- auto r0_j0_d4 = hn::LoadU(tag8_16, src + 4);
- auto r0_j0_d6 = hn::LoadU(tag8_16, src + 6);
-
- auto r0_j8_d0 = hn::LoadU(tag8_16, src + 8);
- auto r0_j8_d2 = hn::LoadU(tag8_16, src + 10);
- auto r0_j8_d4 = hn::LoadU(tag8_16, src + 12);
- auto r0_j8_d6 = hn::LoadU(tag8_16, src + 14);
-
- auto r1_j0_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
- auto r1_j0_d2 = hn::LoadU(tag8_16, src + src_stride + 2);
- auto r1_j0_d4 = hn::LoadU(tag8_16, src + src_stride + 4);
- auto r1_j0_d6 = hn::LoadU(tag8_16, src + src_stride + 6);
-
- auto r1_j8_d0 = hn::LoadU(tag8_16, src + src_stride + 8);
- auto r1_j8_d2 = hn::LoadU(tag8_16, src + src_stride + 10);
- auto r1_j8_d4 = hn::LoadU(tag8_16, src + src_stride + 12);
- auto r1_j8_d6 = hn::LoadU(tag8_16, src + src_stride + 14);
-
- auto r0_j0_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d6, shuffle_mask),
- coeff67)));
-
- auto r0_j8_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d6, shuffle_mask),
- coeff67)));
-
- auto r1_j0_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d6, shuffle_mask),
- coeff67)));
-
- auto r1_j8_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d6, shuffle_mask),
- coeff67)));
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j0_sum, bias_val))),
- tag8_8, dst + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j8_sum, bias_val))),
- tag8_8, dst + 8);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j0_sum, bias_val))),
- tag8_8, dst + dst_stride + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j8_sum, bias_val))),
- tag8_8, dst + dst_stride + 8);
-
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_8tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 8, bias_val, sum_8tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, 0, bias_val, sum_8tap);
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, 8, bias_val, sum_8tap);
+ }
} else if (w == 32) {
while (h >= 2) {
- {
- auto r0_j0_d0 = hn::LoadU(tag8_16, src + 0);
- auto r0_j0_d2 = hn::LoadU(tag8_16, src + 2);
- auto r0_j0_d4 = hn::LoadU(tag8_16, src + 4);
- auto r0_j0_d6 = hn::LoadU(tag8_16, src + 6);
-
- auto r0_j8_d0 = hn::LoadU(tag8_16, src + 8);
- auto r0_j8_d2 = hn::LoadU(tag8_16, src + 10);
- auto r0_j8_d4 = hn::LoadU(tag8_16, src + 12);
- auto r0_j8_d6 = hn::LoadU(tag8_16, src + 14);
-
- auto r1_j0_d0 = hn::LoadU(tag8_16, src + src_stride + 0);
- auto r1_j0_d2 = hn::LoadU(tag8_16, src + src_stride + 2);
- auto r1_j0_d4 = hn::LoadU(tag8_16, src + src_stride + 4);
- auto r1_j0_d6 = hn::LoadU(tag8_16, src + src_stride + 6);
-
- auto r1_j8_d0 = hn::LoadU(tag8_16, src + src_stride + 8);
- auto r1_j8_d2 = hn::LoadU(tag8_16, src + src_stride + 10);
- auto r1_j8_d4 = hn::LoadU(tag8_16, src + src_stride + 12);
- auto r1_j8_d6 = hn::LoadU(tag8_16, src + src_stride + 14);
-
- auto r0_j0_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j0_d6, shuffle_mask),
- coeff67)));
-
- auto r0_j8_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j8_d6, shuffle_mask),
- coeff67)));
-
- auto r1_j0_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j0_d6, shuffle_mask),
- coeff67)));
-
- auto r1_j8_sum = hn::Add(
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d2, shuffle_mask),
- coeff23)),
- hn::Add(hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j8_d6, shuffle_mask),
- coeff67)));
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j0_sum, bias_val))),
- tag8_8, dst + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j8_sum, bias_val))),
- tag8_8, dst + 8);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j0_sum, bias_val))),
- tag8_8, dst + dst_stride + 0);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j8_sum, bias_val))),
- tag8_8, dst + dst_stride + 8);
- }
- {
- auto r0_j16_d0 = hn::LoadU(tag8_16, src + 16);
- auto r0_j16_d2 = hn::LoadU(tag8_16, src + 18);
- auto r0_j16_d4 = hn::LoadU(tag8_16, src + 20);
- auto r0_j16_d6 = hn::LoadU(tag8_16, src + 22);
-
- auto r0_j24_d0 = hn::LoadU(tag8_16, src + 24);
- auto r0_j24_d2 = hn::LoadU(tag8_16, src + 26);
- auto r0_j24_d4 = hn::LoadU(tag8_16, src + 28);
- auto r0_j24_d6 = hn::LoadU(tag8_16, src + 30);
-
- auto r1_j16_d0 = hn::LoadU(tag8_16, src + src_stride + 16);
- auto r1_j16_d2 = hn::LoadU(tag8_16, src + src_stride + 18);
- auto r1_j16_d4 = hn::LoadU(tag8_16, src + src_stride + 20);
- auto r1_j16_d6 = hn::LoadU(tag8_16, src + src_stride + 22);
-
- auto r1_j24_d0 = hn::LoadU(tag8_16, src + src_stride + 24);
- auto r1_j24_d2 = hn::LoadU(tag8_16, src + src_stride + 26);
- auto r1_j24_d4 = hn::LoadU(tag8_16, src + src_stride + 28);
- auto r1_j24_d6 = hn::LoadU(tag8_16, src + src_stride + 30);
-
- auto r0_j16_sum = hn::Add(
- hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j16_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j16_d2, shuffle_mask),
- coeff23)),
- hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j16_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j16_d6, shuffle_mask),
- coeff67)));
-
- auto r0_j24_sum = hn::Add(
- hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j24_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j24_d2, shuffle_mask),
- coeff23)),
- hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j24_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r0_j24_d6, shuffle_mask),
- coeff67)));
-
- auto r1_j16_sum = hn::Add(
- hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j16_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j16_d2, shuffle_mask),
- coeff23)),
- hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j16_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j16_d6, shuffle_mask),
- coeff67)));
-
- auto r1_j24_sum = hn::Add(
- hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j24_d0, shuffle_mask),
- coeff01),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j24_d2, shuffle_mask),
- coeff23)),
- hn::Add(
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j24_d4, shuffle_mask),
- coeff45),
- hn::SatWidenMulPairwiseAdd(
- tag16_8, hn::TableLookupBytes(r1_j24_d6, shuffle_mask),
- coeff67)));
-
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j16_sum, bias_val))),
- tag8_8, dst + 16);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r0_j24_sum, bias_val))),
- tag8_8, dst + 24);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j16_sum, bias_val))),
- tag8_8, dst + dst_stride + 16);
- hn::StoreU(hn::DemoteTo(tag8_8, hn::ShiftRight<FILTER_BITS - 1>(
- hn::Add(r1_j24_sum, bias_val))),
- tag8_8, dst + dst_stride + 24);
- }
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 0, bias_val, sum_8tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 8, bias_val, sum_8tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 16, bias_val, sum_8tap);
+ Process2RowsChunk<8>(tag8_8, tag8_4, src, src_stride, dst, dst_stride,
+ 24, bias_val, sum_8tap);
src += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
}
+ if (h & 1) {
+ for (int j = 0; j < 32; j += 8) {
+ Process1RowChunk<8>(tag8_8, tag8_4, src, dst, j, bias_val, sum_8tap);
+ }
+ }
}
} else {
hn::CappedTag<int16_t, 8> filter_tag;
@@ -1522,6 +1435,16 @@ HWY_MAYBE_UNUSED void Convolve8Horiz(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
+#if HAVE_SSSE3
+  // TODO: jianj - The 16x32 block is still fastest with the handwritten AVX2
+  // path, which uses the SSSE3 implementation. Optimize this case further.
+ if (w == 16 && h == 32) {
+ aom_convolve8_horiz_ssse3(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ return;
+ }
+#endif
+
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index c46d146138..ab8041296e 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -42,6 +42,7 @@
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // __clang__
+#if !CONFIG_HIGHWAY
static inline void xx_storeu2_epi32(const uint8_t *output_ptr,
const ptrdiff_t stride, const __m256i *a) {
*((int *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
@@ -784,8 +785,6 @@ static void aom_filter_block1d16_h8_avx2(
}
}
-#if !CONFIG_HIGHWAY
-
static inline __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
__m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
@@ -1405,6 +1404,7 @@ static void aom_filter_block1d4_v4_avx2(
#endif // !CONFIG_HIGHWAY
#if HAVE_AVX2 && HAVE_SSSE3
+#if !CONFIG_HIGHWAY
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;
@@ -1419,7 +1419,6 @@ filter8_1dfunction aom_filter_block1d4_h2_ssse3;
// int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
-#if !CONFIG_HIGHWAY
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
@@ -1437,4 +1436,4 @@ filter8_1dfunction aom_filter_block1d4_v2_ssse3;
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
#endif // !CONFIG_HIGHWAY
-#endif // HAVE_AX2 && HAVE_SSSE3
+#endif // HAVE_AVX2 && HAVE_SSSE3
diff --git a/aom_dsp/x86/convolve_hwy_avx2.cc b/aom_dsp/x86/convolve_hwy_avx2.cc
new file mode 100644
index 0000000000..f247d1238c
--- /dev/null
+++ b/aom_dsp/x86/convolve_hwy_avx2.cc
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2026, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define HWY_BASELINE_TARGETS HWY_AVX2
+#define HWY_BROKEN_32BIT 0
+
+#include "aom_dsp/convolve_hwy.h"
+
+#if CONFIG_HIGHWAY
+CONVOLVE8HORIZ(avx2)
+#endif
diff --git a/aom_dsp/x86/convolve_hwy_avx512.cc b/aom_dsp/x86/convolve_hwy_avx512.cc
index 62557047ca..7d3ae9db64 100644
--- a/aom_dsp/x86/convolve_hwy_avx512.cc
+++ b/aom_dsp/x86/convolve_hwy_avx512.cc
@@ -14,33 +14,4 @@
#include "aom_dsp/convolve_hwy.h"
-extern "C" void aom_convolve8_horiz_avx2(const uint8_t *src,
- ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-extern "C" void aom_convolve8_horiz_avx512(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w, int h);
-
-HWY_ATTR void aom_convolve8_horiz_avx512(const uint8_t *src,
- ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- // 16x32, 32x32 and 64x32 blocks show ~10% slow down compared with avx2 with
- // significant speed up for all other blocks. Fall back to avx2 for wx32
- // blocks.
- // TODO: jianj - Investigate and optimize for wx32 blocks.
- if (h == 32) {
- aom_convolve8_horiz_avx2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- } else {
- HWY_NAMESPACE::Convolve8Horiz(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- }
-}
+CONVOLVE8HORIZ(avx512)