Commit 81faa2e6 for xz
commit 81faa2e653e81c305cc386c0a5280fef5924c9b7
Author: Lasse Collin <lasse.collin@tukaani.org>
Date: Tue May 19 17:36:15 2026 +0300
liblzma: ARM64 & LoongArch: Avoid aligned_readXXle on unaligned pointers
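While most ARM64 and LoongArch processors support unaligned access,
the code wouldn't have worked on strict-align processors: when the
input was shorter than 8 bytes, the alignment step was skipped but
the remaining bytes were still read with aligned_read32le() and
aligned_read16le() from a possibly unaligned pointer.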
Use byte-by-byte method if the input buffer is at most 7 bytes. This
way the code works on strict-align processors too. It might not be
the very best method for tiny buffers, but it shouldn't matter in
practice (the simple implementation for big buffers isn't amazing
either because it cannot execute more than one CRC32 instruction
at a time on superscalar processors).
Reported-by: Guanni Qu
Fixes: 0ed893668554 ("liblzma: ARM64 CRC32: Align the buffer faster")
Fixes: 7baf6835cfbf ("liblzma: Speed up CRC32 calculation on 64-bit LoongArch")
diff --git a/src/liblzma/check/crc32_arm64.h b/src/liblzma/check/crc32_arm64.h
index cce1131b..4a0818f1 100644
--- a/src/liblzma/check/crc32_arm64.h
+++ b/src/liblzma/check/crc32_arm64.h
@@ -50,9 +50,15 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
{
crc = ~crc;
- if (size >= 8) {
- // Align the input buffer because this was shown to be
- // significantly faster than unaligned accesses.
+ if (size < 8) {
+ while (size > 0) {
+ crc = __crc32b(crc, *buf++);
+ --size;
+ }
+ } else {
+ // We have at least 8 bytes of input. Align the input buffer.
+ // Aligned is faster than unaligned access and works also on
+ // strict-align targets.
const size_t align = (0 - (uintptr_t)buf) & 7;
if (align & 1)
@@ -79,23 +85,21 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
buf < limit; buf += 8)
crc = __crc32d(crc, aligned_read64le(buf));
- size &= 7;
- }
+ // Process the remaining 0-7 bytes.
+ if (size & 4) {
+ crc = __crc32w(crc, aligned_read32le(buf));
+ buf += 4;
+ }
- // Process the remaining bytes that are not 8 byte aligned.
- if (size & 4) {
- crc = __crc32w(crc, aligned_read32le(buf));
- buf += 4;
- }
+ if (size & 2) {
+ crc = __crc32h(crc, aligned_read16le(buf));
+ buf += 2;
+ }
- if (size & 2) {
- crc = __crc32h(crc, aligned_read16le(buf));
- buf += 2;
+ if (size & 1)
+ crc = __crc32b(crc, *buf);
}
- if (size & 1)
- crc = __crc32b(crc, *buf);
-
return ~crc;
}
diff --git a/src/liblzma/check/crc32_loongarch.h b/src/liblzma/check/crc32_loongarch.h
index ec738b83..62cad852 100644
--- a/src/liblzma/check/crc32_loongarch.h
+++ b/src/liblzma/check/crc32_loongarch.h
@@ -21,7 +21,13 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc_unsigned)
{
int32_t crc = (int32_t)~crc_unsigned;
- if (size >= 8) {
+ if (size < 8) {
+ while (size > 0) {
+ crc = __crc_w_b_w((int8_t)*buf++, crc);
+ --size;
+ }
+ } else {
+ // We have at least 8 bytes of input. Align the input buffer.
const size_t align = (0 - (uintptr_t)buf) & 7;
if (align & 1)
@@ -39,26 +45,26 @@ crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc_unsigned)
size -= align;
+ // Process 8 bytes at a time.
for (const uint8_t *limit = buf + (size & ~(size_t)7);
buf < limit; buf += 8)
crc = __crc_w_d_w((int64_t)aligned_read64le(buf), crc);
- size &= 7;
- }
+ // Process the remaining 0-7 bytes.
+ if (size & 4) {
+ crc = __crc_w_w_w((int32_t)aligned_read32le(buf), crc);
+ buf += 4;
+ }
- if (size & 4) {
- crc = __crc_w_w_w((int32_t)aligned_read32le(buf), crc);
- buf += 4;
- }
+ if (size & 2) {
+ crc = __crc_w_h_w((int16_t)aligned_read16le(buf), crc);
+ buf += 2;
+ }
- if (size & 2) {
- crc = __crc_w_h_w((int16_t)aligned_read16le(buf), crc);
- buf += 2;
+ if (size & 1)
+ crc = __crc_w_b_w((int8_t)*buf, crc);
}
- if (size & 1)
- crc = __crc_w_b_w((int8_t)*buf, crc);
-
return (uint32_t)~crc;
}