Commit 1257c12a2a for aom

commit 1257c12a2a889c9c3651df6b77895eca6c3b8e48
Author: Jerome Jiang <jianj@google.com>
Date:   Thu Mar 26 21:40:55 2026 -0400

    use unaligned load for av1_convolve_*_avx2

    This avoids casting pointers to wider types for unaligned memory
    accesses, which is undefined behavior; use memcpy-based
    load/store helpers instead.

    Bug: 496655355
    Change-Id: Ib4d0981c8acda10665bf434cd63e81aaa45a5509

diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h
index 0f829821a9..21a5a53503 100644
--- a/aom_dsp/x86/synonyms.h
+++ b/aom_dsp/x86/synonyms.h
@@ -54,6 +54,11 @@ static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) {
                             _mm_loadl_epi64((const __m128i *)hi));
 }

+static inline void xx_storel_16(void *const a, const __m128i v) {
+  const uint16_t val = (uint16_t)_mm_cvtsi128_si32(v);
+  memcpy(a, &val, sizeof(val));
+}
+
 static inline void xx_storel_32(void *const a, const __m128i v) {
   const int val = _mm_cvtsi128_si32(v);
   memcpy(a, &val, sizeof(val));
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index c2c3c7cb7a..6b82b5a686 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -131,16 +131,15 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
         __m128i s[2], res;

         if (w == 2) {
-          s[0] = _mm_cvtsi32_si128(*(int16_t *)data);
+          s[0] = _mm_cvtsi32_si128(loadu_int16(data));

           do {
-            s[1] = _mm_cvtsi32_si128(*(int16_t *)(data + src_stride));
+            s[1] = _mm_cvtsi32_si128(loadu_int16(data + src_stride));
             res = _mm_avg_epu8(s[0], s[1]);
-            *(int16_t *)dst_ptr = (int16_t)_mm_cvtsi128_si32(res);
-            s[0] = _mm_cvtsi32_si128(*(int16_t *)(data + 2 * src_stride));
+            xx_storel_16(dst_ptr, res);
+            s[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride));
             res = _mm_avg_epu8(s[1], s[0]);
-            *(int16_t *)(dst_ptr + dst_stride) =
-                (int16_t)_mm_cvtsi128_si32(res);
+            xx_storel_16(dst_ptr + dst_stride, res);

             data += 2 * src_stride;
             dst_ptr += 2 * dst_stride;
@@ -638,14 +637,11 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
           const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
           const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
           if (w - j > 2) {
-            *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
-            *(int *)&dst[i * dst_stride + j + dst_stride] =
-                _mm_cvtsi128_si32(res_1);
+            xx_storel_32(&dst[i * dst_stride + j], res_0);
+            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
           } else {
-            *(uint16_t *)&dst[i * dst_stride + j] =
-                (uint16_t)_mm_cvtsi128_si32(res_0);
-            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
-                (uint16_t)_mm_cvtsi128_si32(res_1);
+            xx_storel_16(&dst[i * dst_stride + j], res_0);
+            xx_storel_16(&dst[i * dst_stride + j + dst_stride], res_1);
           }
         }
         s[0] = s[1];
@@ -1148,16 +1144,14 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
         const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
         if (w > 2) {
           // 00 01 02 03
-          *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0);
+          xx_storel_32(&dst[i * dst_stride], res_0);
           // 10 11 12 13
-          *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1);
+          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
         } else {
           // 00 01
-          *(uint16_t *)&dst[i * dst_stride] =
-              (uint16_t)_mm_cvtsi128_si32(res_0);
+          xx_storel_16(&dst[i * dst_stride], res_0);
           // 10 11
-          *(uint16_t *)&dst[i * dst_stride + dst_stride] =
-              (uint16_t)_mm_cvtsi128_si32(res_1);
+          xx_storel_16(&dst[i * dst_stride + dst_stride], res_1);
         }
         i += 2;
       } while (i < h);
@@ -1206,8 +1200,8 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
           __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
           const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
           const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
-          *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
-          *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1);
+          xx_storel_32(&dst[i * dst_stride + j], res_0);
+          xx_storel_32(&dst[i * dst_stride + j + 4], res_1);

           j += 8;
         } while (j < w);
@@ -1309,9 +1303,11 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
           __m128i data = load_x_u8_4x2_sse4(src_ptr, src_stride);
           const __m128i reg1 = _mm_srli_si128(data, 1);
           const __m128i reg2 = _mm_avg_epu8(data, reg1);
-          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(reg2);
-          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(reg2, 2);
-
+          xx_storel_16(dst, reg2);
+          {
+            uint16_t val = (uint16_t)_mm_extract_epi16(reg2, 2);
+            memcpy(dst + dst_stride, &val, sizeof(val));
+          }
           src_ptr += 2 * src_stride;
           dst += 2 * dst_stride;
           h -= 2;
@@ -1323,7 +1319,10 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
           const __m128i reg1 = _mm_srli_si128(data, 1);
           const __m128i reg2 = _mm_avg_epu8(data, reg1);
           xx_storel_32(dst, reg2);
-          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(reg2, 2);
+          {
+            int32_t val = _mm_extract_epi32(reg2, 2);
+            memcpy(dst + dst_stride, &val, sizeof(val));
+          }

           src_ptr += 2 * src_stride;
           dst += 2 * dst_stride;