Commit 242f885743 for aom
commit 242f885743f2c7957679c5ede8bef7d242b2124c
Author: Alex Davicenko <alex.davicenko@arm.com>
Date: Mon Jan 12 14:20:14 2026 +0000
Optimize Neon HBD sad and sad_skip implementation
Optimize Neon HBD sad and sad_skip implementation by increasing the u16
accumulator count from 2 to 4. Add separate implementation for 16-wide
SADs keeping 2 accumulators.
Change-Id: Iabd0aba59ad532cfffa8157a00f3c02552cd1f29
diff --git a/aom_dsp/arm/highbd_sad_neon.c b/aom_dsp/arm/highbd_sad_neon.c
index cb732cb071..b8dde45805 100644
--- a/aom_dsp/arm/highbd_sad_neon.c
+++ b/aom_dsp/arm/highbd_sad_neon.c
@@ -77,6 +77,46 @@ static inline uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr,
return horizontal_add_u32x4(sum_u32);
}
+static inline uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+
+ // 'h_overflow' is the number of 16-wide rows we can process before 16-bit
+ // accumulators overflow. After hitting this limit accumulate into 32-bit
+ // elements. 65535 / 4095 ~= 16, so 16 16-wide rows using two accumulators.
+ const int h_overflow = 16;
+ // If block height 'h' is smaller than this limit, use 'h' instead.
+ const int h_limit = h < h_overflow ? h : h_overflow;
+ assert(h % h_limit == 0);
+
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sum_u16[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+ int i = h_limit;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr);
+ uint16x8_t r0 = vld1q_u16(ref16_ptr);
+ sum_u16[0] = vabaq_u16(sum_u16[0], s0, r0);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + 8);
+ uint16x8_t r1 = vld1q_u16(ref16_ptr + 8);
+ sum_u16[1] = vabaq_u16(sum_u16[1], s1, r1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ sum_u32 = vpadalq_u16(sum_u32, sum_u16[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum_u16[1]);
+ h -= h_limit;
+ } while (h != 0);
+ return horizontal_add_u32x4(sum_u32);
+}
+
static inline uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
@@ -91,7 +131,8 @@ static inline uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
uint32x4_t sum_u32 = vdupq_n_u32(0);
do {
- uint16x8_t sum_u16[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+ uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
int i = h_limit;
do {
@@ -105,7 +146,15 @@ static inline uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
uint16x8_t r1 = vld1q_u16(ref16_ptr + j + 8);
sum_u16[1] = vabaq_u16(sum_u16[1], s1, r1);
- j += 16;
+ uint16x8_t s2 = vld1q_u16(src16_ptr + j + 16);
+ uint16x8_t r2 = vld1q_u16(ref16_ptr + j + 16);
+ sum_u16[2] = vabaq_u16(sum_u16[2], s2, r2);
+
+ uint16x8_t s3 = vld1q_u16(src16_ptr + j + 24);
+ uint16x8_t r3 = vld1q_u16(ref16_ptr + j + 24);
+ sum_u16[3] = vabaq_u16(sum_u16[3], s3, r3);
+
+ j += 32;
} while (j < w);
src16_ptr += src_stride;
@@ -114,32 +163,22 @@ static inline uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
sum_u32 = vpadalq_u16(sum_u32, sum_u16[0]);
sum_u32 = vpadalq_u16(sum_u32, sum_u16[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum_u16[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum_u16[3]);
h -= h_limit;
} while (h != 0);
return horizontal_add_u32x4(sum_u32);
}
-static inline uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h) {
- // 'h_overflow' is the number of 16-wide rows we can process before 16-bit
- // accumulators overflow. After hitting this limit accumulate into 32-bit
- // elements. 65535 / 4095 ~= 16, so 16 16-wide rows using two accumulators.
- const int h_overflow = 16;
- return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h,
- h_overflow);
-}
-
static inline uint32_t highbd_sad32xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
// 'h_overflow' is the number of 32-wide rows we can process before 16-bit
// accumulators overflow. After hitting this limit accumulate into 32-bit
- // elements. 65535 / 4095 ~= 16, so 8 32-wide rows using two accumulators.
- const int h_overflow = 8;
+ // elements. 65535 / 4095 ~= 16, so 16 32-wide rows using four accumulators.
+ const int h_overflow = 16;
return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
h_overflow);
}
@@ -150,8 +189,8 @@ static inline uint32_t highbd_sad64xh_neon(const uint8_t *src_ptr,
int ref_stride, int h) {
// 'h_overflow' is the number of 64-wide rows we can process before 16-bit
// accumulators overflow. After hitting this limit accumulate into 32-bit
- // elements. 65535 / 4095 ~= 16, so 4 64-wide rows using two accumulators.
- const int h_overflow = 4;
+ // elements. 65535 / 4095 ~= 16, so 8 64-wide rows using four accumulators.
+ const int h_overflow = 8;
return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
h_overflow);
}
@@ -162,8 +201,8 @@ static inline uint32_t highbd_sad128xh_neon(const uint8_t *src_ptr,
int ref_stride, int h) {
// 'h_overflow' is the number of 128-wide rows we can process before 16-bit
// accumulators overflow. After hitting this limit accumulate into 32-bit
- // elements. 65535 / 4095 ~= 16, so 2 128-wide rows using two accumulators.
- const int h_overflow = 2;
+ // elements. 65535 / 4095 ~= 16, so 4 128-wide rows using four accumulators.
+ const int h_overflow = 4;
return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h,
h_overflow);
}