Commit 1257c12a2a for aom
commit 1257c12a2a889c9c3651df6b77895eca6c3b8e48
Author: Jerome Jiang <jianj@google.com>
Date: Thu Mar 26 21:40:55 2026 -0400
Use unaligned loads and stores for av1_convolve_*_avx2
This avoids casting byte pointers to wider integer types in order to access memory that may be unaligned.
Bug: 496655355
Change-Id: Ib4d0981c8acda10665bf434cd63e81aaa45a5509
diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h
index 0f829821a9..21a5a53503 100644
--- a/aom_dsp/x86/synonyms.h
+++ b/aom_dsp/x86/synonyms.h
@@ -54,6 +54,11 @@ static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) {
_mm_loadl_epi64((const __m128i *)hi));
}
+static inline void xx_storel_16(void *const a, const __m128i v) {
+ const uint16_t val = (uint16_t)_mm_cvtsi128_si32(v);
+ memcpy(a, &val, sizeof(val));
+}
+
static inline void xx_storel_32(void *const a, const __m128i v) {
const int val = _mm_cvtsi128_si32(v);
memcpy(a, &val, sizeof(val));
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index c2c3c7cb7a..6b82b5a686 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -131,16 +131,15 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
__m128i s[2], res;
if (w == 2) {
- s[0] = _mm_cvtsi32_si128(*(int16_t *)data);
+ s[0] = _mm_cvtsi32_si128(loadu_int16(data));
do {
- s[1] = _mm_cvtsi32_si128(*(int16_t *)(data + src_stride));
+ s[1] = _mm_cvtsi32_si128(loadu_int16(data + src_stride));
res = _mm_avg_epu8(s[0], s[1]);
- *(int16_t *)dst_ptr = (int16_t)_mm_cvtsi128_si32(res);
- s[0] = _mm_cvtsi32_si128(*(int16_t *)(data + 2 * src_stride));
+ xx_storel_16(dst_ptr, res);
+ s[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
res = _mm_avg_epu8(s[1], s[0]);
- *(int16_t *)(dst_ptr + dst_stride) =
- (int16_t)_mm_cvtsi128_si32(res);
+ xx_storel_16(dst_ptr + dst_stride, res);
data += 2 * src_stride;
dst_ptr += 2 * dst_stride;
@@ -638,14 +637,11 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
if (w - j > 2) {
- *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
- *(int *)&dst[i * dst_stride + j + dst_stride] =
- _mm_cvtsi128_si32(res_1);
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
} else {
- *(uint16_t *)&dst[i * dst_stride + j] =
- (uint16_t)_mm_cvtsi128_si32(res_0);
- *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
- (uint16_t)_mm_cvtsi128_si32(res_1);
+ xx_storel_16(&dst[i * dst_stride + j], res_0);
+ xx_storel_16(&dst[i * dst_stride + j + dst_stride], res_1);
}
}
s[0] = s[1];
@@ -1148,16 +1144,14 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
if (w > 2) {
// 00 01 02 03
- *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0);
+ xx_storel_32(&dst[i * dst_stride], res_0);
// 10 11 12 13
- *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
} else {
// 00 01
- *(uint16_t *)&dst[i * dst_stride] =
- (uint16_t)_mm_cvtsi128_si32(res_0);
+ xx_storel_16(&dst[i * dst_stride], res_0);
// 10 11
- *(uint16_t *)&dst[i * dst_stride + dst_stride] =
- (uint16_t)_mm_cvtsi128_si32(res_1);
+ xx_storel_16(&dst[i * dst_stride + dst_stride], res_1);
}
i += 2;
} while (i < h);
@@ -1206,8 +1200,8 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
__m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
- *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
- *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1);
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + 4], res_1);
j += 8;
} while (j < w);
@@ -1309,9 +1303,11 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
__m128i data = load_x_u8_4x2_sse4(src_ptr, src_stride);
const __m128i reg1 = _mm_srli_si128(data, 1);
const __m128i reg2 = _mm_avg_epu8(data, reg1);
- *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(reg2);
- *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(reg2, 2);
-
+ xx_storel_16(dst, reg2);
+ {
+ uint16_t val = (uint16_t)_mm_extract_epi16(reg2, 2);
+ memcpy(dst + dst_stride, &val, sizeof(val));
+ }
src_ptr += 2 * src_stride;
dst += 2 * dst_stride;
h -= 2;
@@ -1323,7 +1319,10 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
const __m128i reg1 = _mm_srli_si128(data, 1);
const __m128i reg2 = _mm_avg_epu8(data, reg1);
xx_storel_32(dst, reg2);
- *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(reg2, 2);
+ {
+ int32_t val = _mm_extract_epi32(reg2, 2);
+ memcpy(dst + dst_stride, &val, sizeof(val));
+ }
src_ptr += 2 * src_stride;
dst += 2 * dst_stride;