Commit 708c14699b for aom
commit 708c14699b95c4b40db125c83c37a5040d17211e
Author: Alex Davicenko <alex.davicenko@arm.com>
Date: Thu Oct 23 16:40:31 2025 +0100
Improve Arm highbd_convolve_y_sr_12tap_sve2
Optimize the implementation of highbd_convolve_y_sr_12tap_sve2 by:
- Operating on 4x4 - rather than 8x4 - blocks, reducing loop-carried
dependencies.
- Using transpose_concat (ZIP1/2) for block shuffling instead of TBL2,
removing lookup table index setup costs.
Change-Id: Ic902982725883e0f2d87b8626fab87ba47d2b279
diff --git a/av1/common/arm/highbd_convolve_sve2.c b/av1/common/arm/highbd_convolve_sve2.c
index d50e2d0273..a455e8030b 100644
--- a/av1/common/arm/highbd_convolve_sve2.c
+++ b/av1/common/arm/highbd_convolve_sve2.c
@@ -429,21 +429,6 @@ static inline void highbd_convolve_y_sr_12tap_sve2(
const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4);
- uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kHbdDotProdMergeBlockTbl);
- // Scale indices by size of the true vector length to avoid reading from an
- // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
- uint16x8_t correction0 =
- vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
- merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);
-
- uint16x8_t correction1 =
- vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
- merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);
-
- uint16x8_t correction2 =
- vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
- merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);
-
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
do {
@@ -472,14 +457,11 @@ static inline void highbd_convolve_y_sr_12tap_sve2(
load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
+ transpose_concat_elems_s16_4x4(s8, s9, sA, sB, s89AB);
+ transpose_concat_elems_s16_4x4(s9, sA, sB, sC, s9ABC);
+ transpose_concat_elems_s16_4x4(sA, sB, sC, sD, sABCD);
transpose_concat_elems_s16_4x4(sB, sC, sD, sE, sBCDE);
- // Use the above transpose and reuse data from the previous loop to get
- // the rest.
- aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB);
- aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC);
- aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD);
-
uint16x4_t d0 = highbd_convolve12_4_y(s0123, s4567, s89AB, y_filter_0_7,
y_filter_4_11, max);
uint16x4_t d1 = highbd_convolve12_4_y(s1234, s5678, s9ABC, y_filter_0_7,
@@ -510,6 +492,10 @@ static inline void highbd_convolve_y_sr_12tap_sve2(
s789A[0] = sBCDE[0];
s789A[1] = sBCDE[1];
+ s8 = sC;
+ s9 = sD;
+ sA = sE;
+
s += 4 * src_stride;
d += 4 * dst_stride;
h -= 4;