Commit f49e40dea3 for aom
commit f49e40dea3fe7f44969a9373e841ca29d86f349a
Author: Alex Davicenko <alex.davicenko@arm.com>
Date: Mon Nov 10 13:55:00 2025 +0000
Improve Arm highbd_convolve8_vert_8tap_sve
Optimize the implementation of highbd_convolve8_vert_8tap_sve by:
- Operating on 4x4 blocks rather than 8x4 blocks, which reduces
  loop-carried dependencies.
- Using transpose_concat (ZIP1/2) for block shuffling instead of TBL2,
  which removes the cost of setting up lookup table indices.
Change-Id: I6012acadbef13967d2cec74a145fb42041576ac6
diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c
index 1863aef951..8af5c5e6da 100644
--- a/aom_dsp/arm/highbd_convolve8_sve.c
+++ b/aom_dsp/arm/highbd_convolve8_sve.c
@@ -33,15 +33,6 @@ DECLARE_ALIGNED(16, const uint16_t, kDeinterleaveTbl[8]) = {
};
// clang-format on
-DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
- // Shift left and insert new last column in transposed 4x4 block.
- 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25,
- // Shift left and insert two new columns in transposed 4x4 block.
- 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27,
- // Shift left and insert three new columns in transposed 4x4 block.
- 6, 7, 16, 17, 18, 19, 20, 21, 14, 15, 24, 25, 26, 27, 28, 29
-};
-
static inline uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter,
uint16x4_t max) {
int64x2_t sum[4];
@@ -278,34 +269,6 @@ void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride,
}
}
-static inline void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4],
- uint8x16_t tbl, int16x8_t res[4]) {
- int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]),
- vreinterpretq_s8_s16(t1[0]) };
- int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]),
- vreinterpretq_s8_s16(t1[1]) };
- int8x16x2_t samples2 = { vreinterpretq_s8_s16(t0[2]),
- vreinterpretq_s8_s16(t1[2]) };
- int8x16x2_t samples3 = { vreinterpretq_s8_s16(t0[3]),
- vreinterpretq_s8_s16(t1[3]) };
-
- res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl));
- res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl));
- res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples2, tbl));
- res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples3, tbl));
-}
-
-static inline void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2],
- uint8x16_t tbl, int16x8_t res[2]) {
- int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]),
- vreinterpretq_s8_s16(t1[0]) };
- int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]),
- vreinterpretq_s8_s16(t1[1]) };
-
- res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl));
- res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl));
-}
-
static inline uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2],
int16x8_t samples_hi[2],
int16x8_t filter,
@@ -325,47 +288,17 @@ static inline uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2],
return vmin_u16(res, max);
}
-static inline uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4],
- int16x8_t samples_hi[4],
- int16x8_t filter,
- uint16x8_t max) {
- int64x2_t sum[4];
-
- sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
- sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1);
-
- sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
- sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1);
-
- sum[2] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0);
- sum[2] = aom_svdot_lane_s16(sum[2], samples_hi[2], filter, 1);
-
- sum[3] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0);
- sum[3] = aom_svdot_lane_s16(sum[3], samples_hi[3], filter, 1);
-
- int32x4_t res0 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1]));
- int32x4_t res1 = vcombine_s32(vmovn_s64(sum[2]), vmovn_s64(sum[3]));
-
- uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
- vqrshrun_n_s32(res1, FILTER_BITS));
-
- return vminq_u16(res, max);
-}
-
static inline void highbd_convolve8_vert_8tap_sve(
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height,
int bd) {
const int16x8_t y_filter = vld1q_s16(filter_y);
- uint8x16_t merge_block_tbl[3];
- merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl);
- merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16);
- merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32);
-
- if (width == 4) {
+ do {
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
int16_t *s = (int16_t *)src;
+ uint16_t *d = dst;
+ int h = height;
int16x4_t s0, s1, s2, s3, s4, s5, s6;
load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
@@ -386,19 +319,17 @@ static inline void highbd_convolve8_vert_8tap_sve(
int16x8_t s4567[2], s5678[2], s6789[2], s78910[2];
// Transpose and shuffle the 4 lines that were loaded.
+ transpose_concat_elems_s16_4x4(s4, s5, s6, s7, s4567);
+ transpose_concat_elems_s16_4x4(s5, s6, s7, s8, s5678);
+ transpose_concat_elems_s16_4x4(s6, s7, s8, s9, s6789);
transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s78910);
- // Merge new data into block from previous iteration.
- aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[0], s4567);
- aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[1], s5678);
- aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[2], s6789);
-
uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, y_filter, max);
uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, y_filter, max);
uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, y_filter, max);
uint16x4_t d3 = highbd_convolve8_4_v(s3456, s78910, y_filter, max);
- store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
// Prepare block for next iteration - re-using as much as possible.
// Shuffle everything up four rows.
@@ -411,81 +342,18 @@ static inline void highbd_convolve8_vert_8tap_sve(
s3456[0] = s78910[0];
s3456[1] = s78910[1];
- s += 4 * src_stride;
- dst += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- } else {
- const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- do {
- int h = height;
- int16_t *s = (int16_t *)src;
- uint16_t *d = dst;
-
- int16x8_t s0, s1, s2, s3, s4, s5, s6;
- load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
- // This operation combines a conventional transpose and the sample permute
- // required before computing the dot product.
- int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
- transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
- transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
- transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
- transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
-
- do {
- int16x8_t s7, s8, s9, s10;
- load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- int16x8_t s4567[4], s5678[4], s6789[4], s78910[4];
-
- // Transpose and shuffle the 4 lines that were loaded.
- transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s78910);
-
- // Merge new data into block from previous iteration.
- aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[0], s4567);
- aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[1], s5678);
- aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[2], s6789);
-
- uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, y_filter, max);
- uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, y_filter, max);
- uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, y_filter, max);
- uint16x8_t d3 = highbd_convolve8_8_v(s3456, s78910, y_filter, max);
-
- store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
- // Prepare block for next iteration - re-using as much as possible.
- // Shuffle everything up four rows.
- s0123[0] = s4567[0];
- s0123[1] = s4567[1];
- s0123[2] = s4567[2];
- s0123[3] = s4567[3];
-
- s1234[0] = s5678[0];
- s1234[1] = s5678[1];
- s1234[2] = s5678[2];
- s1234[3] = s5678[3];
-
- s2345[0] = s6789[0];
- s2345[1] = s6789[1];
- s2345[2] = s6789[2];
- s2345[3] = s6789[3];
-
- s3456[0] = s78910[0];
- s3456[1] = s78910[1];
- s3456[2] = s78910[2];
- s3456[3] = s78910[3];
-
- s += 4 * src_stride;
- d += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- src += 8;
- dst += 8;
- width -= 8;
- } while (width != 0);
- }
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ src += 4;
+ dst += 4;
+ width -= 4;
+ } while (width != 0);
}
void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride,