Commit f49e40dea3 for aom

commit f49e40dea3fe7f44969a9373e841ca29d86f349a
Author: Alex Davicenko <alex.davicenko@arm.com>
Date:   Mon Nov 10 13:55:00 2025 +0000

    Improve Arm highbd_convolve8_vert_8tap_sve

    Optimize the implementation of highbd_convolve8_vert_8tap_sve by:
     - Operating on 4x4 blocks rather than 8x4 blocks, reducing
       loop-carried dependencies. All widths now share a single loop that
       advances four columns at a time, replacing the separate width == 4
       and 8-wide paths.
     - Using transpose_concat (ZIP1/2) for block shuffling instead of TBL2,
       removing lookup table index setup costs.

    Change-Id: I6012acadbef13967d2cec74a145fb42041576ac6

diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c
index 1863aef951..8af5c5e6da 100644
--- a/aom_dsp/arm/highbd_convolve8_sve.c
+++ b/aom_dsp/arm/highbd_convolve8_sve.c
@@ -33,15 +33,6 @@ DECLARE_ALIGNED(16, const uint16_t, kDeinterleaveTbl[8]) = {
 };
 // clang-format on

-DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
-  // Shift left and insert new last column in transposed 4x4 block.
-  2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25,
-  // Shift left and insert two new columns in transposed 4x4 block.
-  4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27,
-  // Shift left and insert three new columns in transposed 4x4 block.
-  6, 7, 16, 17, 18, 19, 20, 21, 14, 15, 24, 25, 26, 27, 28, 29
-};
-
 static inline uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter,
                                               uint16x4_t max) {
   int64x2_t sum[4];
@@ -278,34 +269,6 @@ void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }

-static inline void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4],
-                                  uint8x16_t tbl, int16x8_t res[4]) {
-  int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]),
-                           vreinterpretq_s8_s16(t1[0]) };
-  int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]),
-                           vreinterpretq_s8_s16(t1[1]) };
-  int8x16x2_t samples2 = { vreinterpretq_s8_s16(t0[2]),
-                           vreinterpretq_s8_s16(t1[2]) };
-  int8x16x2_t samples3 = { vreinterpretq_s8_s16(t0[3]),
-                           vreinterpretq_s8_s16(t1[3]) };
-
-  res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl));
-  res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl));
-  res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples2, tbl));
-  res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples3, tbl));
-}
-
-static inline void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2],
-                                  uint8x16_t tbl, int16x8_t res[2]) {
-  int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]),
-                           vreinterpretq_s8_s16(t1[0]) };
-  int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]),
-                           vreinterpretq_s8_s16(t1[1]) };
-
-  res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl));
-  res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl));
-}
-
 static inline uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2],
                                               int16x8_t samples_hi[2],
                                               int16x8_t filter,
@@ -325,47 +288,17 @@ static inline uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2],
   return vmin_u16(res, max);
 }

-static inline uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4],
-                                              int16x8_t samples_hi[4],
-                                              int16x8_t filter,
-                                              uint16x8_t max) {
-  int64x2_t sum[4];
-
-  sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
-  sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1);
-
-  sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
-  sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1);
-
-  sum[2] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0);
-  sum[2] = aom_svdot_lane_s16(sum[2], samples_hi[2], filter, 1);
-
-  sum[3] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0);
-  sum[3] = aom_svdot_lane_s16(sum[3], samples_hi[3], filter, 1);
-
-  int32x4_t res0 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1]));
-  int32x4_t res1 = vcombine_s32(vmovn_s64(sum[2]), vmovn_s64(sum[3]));
-
-  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
-                                vqrshrun_n_s32(res1, FILTER_BITS));
-
-  return vminq_u16(res, max);
-}
-
 static inline void highbd_convolve8_vert_8tap_sve(
     const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
     ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height,
     int bd) {
   const int16x8_t y_filter = vld1q_s16(filter_y);

-  uint8x16_t merge_block_tbl[3];
-  merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl);
-  merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16);
-  merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32);
-
-  if (width == 4) {
+  do {
     const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
     int16_t *s = (int16_t *)src;
+    uint16_t *d = dst;
+    int h = height;

     int16x4_t s0, s1, s2, s3, s4, s5, s6;
     load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
@@ -386,19 +319,17 @@ static inline void highbd_convolve8_vert_8tap_sve(
       int16x8_t s4567[2], s5678[2], s6789[2], s78910[2];

       // Transpose and shuffle the 4 lines that were loaded.
+      transpose_concat_elems_s16_4x4(s4, s5, s6, s7, s4567);
+      transpose_concat_elems_s16_4x4(s5, s6, s7, s8, s5678);
+      transpose_concat_elems_s16_4x4(s6, s7, s8, s9, s6789);
       transpose_concat_elems_s16_4x4(s7, s8, s9, s10, s78910);

-      // Merge new data into block from previous iteration.
-      aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[0], s4567);
-      aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[1], s5678);
-      aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[2], s6789);
-
       uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, y_filter, max);
       uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, y_filter, max);
       uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, y_filter, max);
       uint16x4_t d3 = highbd_convolve8_4_v(s3456, s78910, y_filter, max);

-      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

       // Prepare block for next iteration - re-using as much as possible.
       // Shuffle everything up four rows.
@@ -411,81 +342,18 @@ static inline void highbd_convolve8_vert_8tap_sve(
       s3456[0] = s78910[0];
       s3456[1] = s78910[1];

-      s += 4 * src_stride;
-      dst += 4 * dst_stride;
-      height -= 4;
-    } while (height != 0);
-  } else {
-    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-    do {
-      int h = height;
-      int16_t *s = (int16_t *)src;
-      uint16_t *d = dst;
-
-      int16x8_t s0, s1, s2, s3, s4, s5, s6;
-      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      s += 7 * src_stride;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;

-      // This operation combines a conventional transpose and the sample permute
-      // required before computing the dot product.
-      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
-      transpose_concat_elems_s16_8x4(s0, s1, s2, s3, s0123);
-      transpose_concat_elems_s16_8x4(s1, s2, s3, s4, s1234);
-      transpose_concat_elems_s16_8x4(s2, s3, s4, s5, s2345);
-      transpose_concat_elems_s16_8x4(s3, s4, s5, s6, s3456);
-
-      do {
-        int16x8_t s7, s8, s9, s10;
-        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-        int16x8_t s4567[4], s5678[4], s6789[4], s78910[4];
-
-        // Transpose and shuffle the 4 lines that were loaded.
-        transpose_concat_elems_s16_8x4(s7, s8, s9, s10, s78910);
-
-        // Merge new data into block from previous iteration.
-        aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[0], s4567);
-        aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[1], s5678);
-        aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[2], s6789);
-
-        uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, y_filter, max);
-        uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, y_filter, max);
-        uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, y_filter, max);
-        uint16x8_t d3 = highbd_convolve8_8_v(s3456, s78910, y_filter, max);
-
-        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-        // Prepare block for next iteration - re-using as much as possible.
-        // Shuffle everything up four rows.
-        s0123[0] = s4567[0];
-        s0123[1] = s4567[1];
-        s0123[2] = s4567[2];
-        s0123[3] = s4567[3];
-
-        s1234[0] = s5678[0];
-        s1234[1] = s5678[1];
-        s1234[2] = s5678[2];
-        s1234[3] = s5678[3];
-
-        s2345[0] = s6789[0];
-        s2345[1] = s6789[1];
-        s2345[2] = s6789[2];
-        s2345[3] = s6789[3];
-
-        s3456[0] = s78910[0];
-        s3456[1] = s78910[1];
-        s3456[2] = s78910[2];
-        s3456[3] = s78910[3];
-
-        s += 4 * src_stride;
-        d += 4 * dst_stride;
-        h -= 4;
-      } while (h != 0);
-      src += 8;
-      dst += 8;
-      width -= 8;
-    } while (width != 0);
-  }
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+    src += 4;
+    dst += 4;
+    width -= 4;
+  } while (width != 0);
 }

 void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride,