Commit 6b0ed85d42 for openssl.org
commit 6b0ed85d4278afd605e1693c4a2165d327f32bb6
Author: WANG Xuerui <git@xen0n.name>
Date: Tue Aug 5 14:16:43 2025 +0800
LoongArch: Accelerate SHA-256 message scheduling with LSX
Detect and use LSX (128-bit SIMD) to accelerate the message scheduling
stage of SHA-256. The main compression round is not amenable to SIMD
optimizations because horizontal data dependencies are very heavy.
Due to data dependencies between x[n] and x[n-2] for rounds >= 16
(0-based), at most 2 SIMD lanes can be efficiently utilized, which
unfortunately means half of the space is wasted with LSX (128-bit = 4 32-bit
lanes), and LASX (256-bit, 8 32-bit lanes) is meaningless. In the
current approach, 8 vector registers are used to store the 16 active
message schedule array elements; only vr[0] and vr[2] are meaningful for
each VR.
Performance numbers on Loongson 3C6000 (LA664 uarch) @ 2.2GHz:
Before:
```
version: 3.6.0-dev
built on: Sun Aug 3 10:22:36 2025 UTC
options: bn(64,64)
compiler: gcc -fPIC -pthread -Wa,--noexecstack -Wall -O3 -DOPENSSL_USE_NODELETE -DL_ENDIAN -DOPENSSL_PIC -DOPENSSL_BUILDING_OPENSSL -DNDEBUG
CPUINFO: N/A
The 'numbers' are in 1000s of bytes per second processed.
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
sha256 32676.25k 87543.16k 168349.71k 218342.74k 240099.57k 241639.34k
```
After:
```
version: 3.6.0-dev
built on: Sun Aug 3 10:22:36 2025 UTC
options: bn(64,64)
compiler: gcc -fPIC -pthread -Wa,--noexecstack -Wall -O3 -DOPENSSL_USE_NODELETE -DL_ENDIAN -DOPENSSL_PIC -DOPENSSL_BUILDING_OPENSSL -DNDEBUG
CPUINFO: N/A
The 'numbers' are in 1000s of bytes per second processed.
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
sha256 37180.32k 104604.94k 206861.06k 276741.65k 306465.73k 308863.06k
```
Which is an improvement of between 13.78% (for 16B inputs) and 27.82%
(for 16KiB inputs).
I have also experimented with a denser storage of message elements, with
4 VRs each storing 4 elements, but performance dropped by 0.25% (16B) to
2.42% (16KiB) because sigma1 must be computed twice with this storage
layout: once on x[14..15], once on the fresh x[16..17].
Signed-off-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <paul.dale@oracle.com>
Reviewed-by: Neil Horman <nhorman@openssl.org>
MergeDate: Wed Mar 11 13:56:15 2026
(Merged from https://github.com/openssl/openssl/pull/28192)
diff --git a/crypto/sha/asm/sha256-loongarch64.pl b/crypto/sha/asm/sha256-loongarch64.pl
index 2a050ba88d..3cba5ba258 100644
--- a/crypto/sha/asm/sha256-loongarch64.pl
+++ b/crypto/sha/asm/sha256-loongarch64.pl
@@ -42,6 +42,9 @@ use warnings;
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+my $use_lsx = $flavour && $flavour =~ /lsx/i ? 1 : 0;
+my $isaext = "_" . ( $use_lsx ? "lsx" : "la64v100" );
+
$output and open STDOUT,">$output";
my $code=<<___;
@@ -55,10 +58,13 @@ my ($zero,$ra,$tp,$sp,$fp)=("\$zero", "\$ra", "\$tp", "\$sp", "\$fp");
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$a$_",(0..7));
my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8)=map("\$t$_",(0..8));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8)=map("\$s$_",(0..8));
+my ($va0, $va1, $va2, $va3, $va4, $va5, $va6, $va7) = map("\$vr$_",(0..7));
+my ($vt0, $vt1, $vt2, $vt3, $vt4, $vt5, $vt6, $vt7) = map("\$vr$_",(8..15));
my ($INP, $LEN, $ADDR) = ($a1, $a2, $sp);
my ($KT, $T1, $T2, $T3, $T4, $T5, $T6) = ($t0, $t1, $t2, $t3, $t4, $t5, $t6);
my ($A, $B, $C, $D, $E, $F, $G, $H) = ($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7);
+my @VMSGS = ($va0, $va1, $va2, $va3, $va4, $va5, $va6, $va7);
sub strip {
my ($str) = @_;
@@ -66,18 +72,109 @@ sub strip {
return $str;
}
+sub MSGSCHEDULE0_lsx {
+ my ($index) = @_;
+ my $msg = $VMSGS[$index / 2];
+ my $msg2 = $VMSGS[$index / 2 + 1];
+ my ($tmp0, $tmp1) = ($vt0, $vt1);
+ my $code;
+
+ if ($index % 4 == 0) {
+ $code = <<___;
+ vld $tmp0, $INP, @{[4*$index]}
+ vshuf4i.b $tmp0, $tmp0, 0b00011011 # 0123
+ vldi $msg2, 0
+ vilvl.w $msg, $msg2, $tmp0 # 0_1_
+ vilvh.w $msg2, $msg2, $tmp0 # 2_3_
+___
+ }
+
+ $code .= <<___;
+ vpickve2gr.w $T1, $msg, @{[($index%2)*2]}
+___
+
+ return strip($code);
+}
+
sub MSGSCHEDULE0 {
my ($index) = @_;
+
+ if ($use_lsx) {
+ return MSGSCHEDULE0_lsx($index);
+ }
+
my $code=<<___;
ld.w $T1, $INP, @{[4*$index]}
revb.2w $T1, $T1
st.w $T1, $ADDR, @{[4*$index]}
___
+
+ return strip($code);
+}
+
+sub MSGSCHEDULE1_lsx {
+ my ($index) = @_;
+ my $msgidx = ($index / 2) % 8;
+ my $m01 = $VMSGS[$msgidx];
+ my $m23 = $VMSGS[($msgidx + 1) % 8];
+ my $m45 = $VMSGS[($msgidx + 2) % 8];
+ my $m67 = $VMSGS[($msgidx + 3) % 8];
+ my $m89 = $VMSGS[($msgidx + 4) % 8];
+ my $mab = $VMSGS[($msgidx + 5) % 8];
+ my $mcd = $VMSGS[($msgidx + 6) % 8];
+ my $mef = $VMSGS[($msgidx + 7) % 8];
+ my ($m12, $tmp0, $tmp1) = ($vt0, $vt1, $vt2);
+ my $code;
+
+ if ($index % 2 == 0) {
+ # re-align to get $m12 and "$m9a" ($tmp0)
+ # $m01 += $m9a
+ $code = <<___;
+ # m01 & new = $m01, m23 = $m23, m45 = $m45, m67 = $m67
+ # m89 = $m89, mab = $mab, mcd = $mcd, mef = $mef
+ vbsrl.v $m12, $m01, 8 # 1___
+ vextrins.w $m12, $m23, 0b00100000 # 1_2_
+ vbsrl.v $tmp0, $m89, 8 # 9___
+ vextrins.w $tmp0, $mab, 0b00100000 # 9_a_
+ vadd.w $m01, $m01, $tmp0
+___
+
+ # $m01 += sigma0($m12)
+ $code .= <<___;
+ vrotri.w $tmp0, $m12, 7
+ vrotri.w $tmp1, $m12, 18
+ vsrli.w $m12, $m12, 3
+ vxor.v $tmp0, $tmp0, $tmp1
+ vxor.v $m12, $m12, $tmp0
+ vadd.w $m01, $m01, $m12
+___
+
+ # $m01 += sigma1($mef)
+ # now m1234 can be re-used as temporary
+ $code .= <<___;
+ vrotri.w $tmp0, $mef, 17
+ vrotri.w $tmp1, $mef, 19
+ vsrli.w $m12, $mef, 10
+ vxor.v $tmp0, $tmp0, $tmp1
+ vxor.v $m12, $m12, $tmp0
+ vadd.w $m01, $m01, $m12
+___
+ }
+
+ $code .= <<___;
+ vpickve2gr.w $T1, $m01, @{[($index%2)*2]}
+___
+
return strip($code);
}
sub MSGSCHEDULE1 {
my ($index) = @_;
+
+ if ($use_lsx) {
+ return MSGSCHEDULE1_lsx($index);
+ }
+
my $code=<<___;
ld.w $T1, $ADDR, @{[(($index-2)&0x0f)*4]}
ld.w $T2, $ADDR, @{[(($index-15)&0x0f)*4]}
@@ -152,12 +249,12 @@ ___
}
################################################################################
-# void sha256_block_data_order(void *c, const void *p, size_t len)
+# void sha256_block_data_order$isaext(void *c, const void *p, size_t len)
$code .= <<___;
.p2align 3
-.globl sha256_block_data_order
-.type sha256_block_data_order,\@function
-sha256_block_data_order:
+.globl sha256_block_data_order@{[$isaext]}
+.type sha256_block_data_order@{[$isaext]},\@function
+sha256_block_data_order@{[$isaext]}:
addi.d $sp, $sp, -80
@@ -171,9 +268,17 @@ sha256_block_data_order:
st.d $s7, $sp, 56
st.d $s8, $sp, 64
st.d $fp, $sp, 72
+___
+# SHA256 LSX needs neither dedicated shuffle control word, nor stack space for
+# internal states
+if (!$use_lsx) {
+ $code .= <<___;
addi.d $sp, $sp, -64
+___
+}
+$code .= <<___;
la $KT, $K256
# load ctx
@@ -238,9 +343,15 @@ $code .= <<___;
addi.d $INP, $INP, 64
bnez $LEN, L_round_loop
+___
+if (!$use_lsx) {
+ $code .= <<___;
addi.d $sp, $sp, 64
+___
+}
+$code .= <<___;
ld.d $s0, $sp, 0
ld.d $s1, $sp, 8
ld.d $s2, $sp, 16
@@ -255,7 +366,7 @@ $code .= <<___;
addi.d $sp, $sp, 80
ret
-.size sha256_block_data_order,.-sha256_block_data_order
+.size sha256_block_data_order@{[$isaext]},.-sha256_block_data_order@{[$isaext]}
.section .rodata
.p2align 3
diff --git a/crypto/sha/build.info b/crypto/sha/build.info
index f495fa7072..f7e7c6d175 100644
--- a/crypto/sha/build.info
+++ b/crypto/sha/build.info
@@ -18,7 +18,7 @@ IF[{- !$disabled{asm} -}]
$SHA1ASM_alpha=sha1-alpha.S
$SHA1DEF_alpha=SHA1_ASM
- $SHA1ASM_loongarch64=sha256-loongarch64.S sha512-loongarch64.S
+ $SHA1ASM_loongarch64=sha_loongarch.c sha256-loongarch64.S sha256-loongarch64-lsx.S sha512-loongarch64.S
$SHA1DEF_loongarch64=SHA256_ASM SHA512_ASM
$SHA1ASM_mips32=sha1-mips.S sha256-mips.S
@@ -138,6 +138,8 @@ GENERATE[sha512-parisc.s]=asm/sha512-parisc.pl
GENERATE[sha256-loongarch64.S]=asm/sha256-loongarch64.pl
INCLUDE[sha256-loongarch64.o]=..
+GENERATE[sha256-loongarch64-lsx.S]=asm/sha256-loongarch64.pl lsx
+INCLUDE[sha256-loongarch64-lsx.o]=..
GENERATE[sha512-loongarch64.S]=asm/sha512-loongarch64.pl
INCLUDE[sha512-loongarch64.o]=..
diff --git a/crypto/sha/sha_loongarch.c b/crypto/sha/sha_loongarch.c
new file mode 100644
index 0000000000..bade69f19f
--- /dev/null
+++ b/crypto/sha/sha_loongarch.c
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2025 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <openssl/opensslconf.h>
+#include <openssl/sha.h>
+#include "crypto/loongarch_arch.h"
+
+void sha256_block_data_order_la64v100(void *ctx, const void *in, size_t num);
+void sha256_block_data_order_lsx(void *ctx, const void *in, size_t num);
+void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num);
+
+void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num)
+{
+ if (OPENSSL_loongarch_hwcap_P & LOONGARCH_HWCAP_LSX) {
+ sha256_block_data_order_lsx(ctx, in, num);
+ } else {
+ sha256_block_data_order_la64v100(ctx, in, num);
+ }
+}