Commit b34e890c21 for openssl.org
commit b34e890c2104550c9a02b29eeb81da44240a2aac
Author: fengpengbo <feng.pengbo@zte.com.cn>
Date: Fri Dec 19 11:00:59 2025 +0800
This PR performs further optimization based on the already merged "Implement Montgomery multiplication assembly optimization for RV64GC" (#28012).
The key improvements include: code formatting unification, instruction scheduling optimization, an updated register allocation strategy, and revised handling of the addition/subtraction carry and borrow flags.
These enhancements aim to improve both code maintainability and execution performance, particularly on out-of-order RISC-V cores.
Reviewed-by: Neil Horman <nhorman@openssl.org>
Reviewed-by: Paul Dale <paul.dale@oracle.com>
(Merged from https://github.com/openssl/openssl/pull/29438)
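
For readers not fluent in perlasm: RV64GC has no carry flag, so the carry handling this patch touches is emulated with sltu, checking whether an addition wrapped around below one of its addends; the scheduling changes then interleave independent mul/mulhu instructions between these dependent add/sltu pairs. A minimal C sketch of the pattern, assuming uint64_t as a stand-in for BN_ULONG and with add_with_carry as a hypothetical helper name (not OpenSSL API):

    #include <stdint.h>

    /* Emulated add-with-carry, as done throughout the patch with
     * add + sltu: the carry out of a + b is 1 exactly when the
     * 64-bit sum wraps around and ends up below the addend a. */
    static inline uint64_t add_with_carry(uint64_t a, uint64_t b,
                                          uint64_t *carry_out)
    {
        uint64_t sum = a + b;   /* add  sum, a, b     */
        *carry_out = sum < a;   /* sltu carry, sum, a */
        return sum;
    }

This is also why several hunks replace the old "add $temp / sltu ... / mv" sequences with an add whose destination is one of its own inputs: comparing the result against the other addend still recovers the carry, and the $temp copy plus the mv disappear.
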
diff --git a/crypto/bn/asm/riscv64-mont.pl b/crypto/bn/asm/riscv64-mont.pl
index b512dcfd01..af55d0c1c3 100644
--- a/crypto/bn/asm/riscv64-mont.pl
+++ b/crypto/bn/asm/riscv64-mont.pl
@@ -57,7 +57,7 @@ $output and open STDOUT,">$output";
$stack_offset -= 8;
$ret.=" sd $_,$stack_offset(sp)\n";
}
- return $ret;
+ return $ret;
}
sub load_regs {
my $ret = '';
@@ -70,7 +70,7 @@ $output and open STDOUT,">$output";
$stack_offset -= 8;
$ret.=" ld $_,$stack_offset(sp)\n";
}
- $ret.=" addi sp,sp,$stack_reservation\n";
+ $ret.=" addi sp,sp,$stack_reservation\n";
return $ret;
}
sub clear_regs {
@@ -84,12 +84,12 @@ $output and open STDOUT,">$output";
# Function arguments
# RISC-V ABI
-# $rp x10 a0 # BN_ULONG *rp
-# $ap x11 a1 # const BN_ULONG *ap
-# $bp x12 a2 # const BN_ULONG *bp
-# $np x13 a3 # const BN_ULONG *np
-# $n0 x14 a4 # const BN_ULONG *n0
-# $num x15 a5 # int num
+# $rp x10 a0 # BN_ULONG *rp
+# $ap x11 a1 # const BN_ULONG *ap
+# $bp x12 a2 # const BN_ULONG *bp
+# $np x13 a3 # const BN_ULONG *np
+# $n0 x14 a4 # const BN_ULONG *n0
+# $num x15 a5 # int num
my ($rp,$ap,$bp,$np,$n0,$num) = use_regs(10,11,12,13,14,15);
# Return address and Frame pointer
@@ -100,24 +100,24 @@ my ($ra,$fp) = use_regs(1,8);
# Temporary variable allocation
# RISC-V ABI
-# $lo0 x5 t0 the sum of partial products of a and b
-# $hi0 x6 t1 the high word of partial product of a and b + Carry
-# $aj x7 t2 ap[j]
-# $m0 x28 t3 bp[i]
-# $alo x29 t4 the low word of partial product
-# $ahi x30 t5 the high word of partial product
-# $lo1 x31 t6 partial product + reduction term
-# $hi1 x18 s2 the high word of reduction term + Carry
-# $nj x19 s3 np[j],modulus
-# $m1 x20 s4 montgomery reduction coefficient
-# $nlo x21 s5 the low word of reduction term
-# $nhi x22 s6 the high word of reduction term
-# $ovf x23 s7 highest carry bit,overflow flag
-# $i x24 s8 outer loop index
-# $j x25 s9 inner loop index
-# $tp x26 s10 temporary result storage
-# $tj x27 s11 tp[j],temporary result value
-# $temp x9 s1
+# $lo0 x5 t0 the sum of partial products of a and b
+# $hi0 x6 t1 the high word of partial product of a and b + Carry
+# $aj x7 t2 ap[j]
+# $m0 x28 t3 bp[i]
+# $alo x29 t4 the low word of partial product
+# $ahi x30 t5 the high word of partial product
+# $lo1 x31 t6 partial product + reduction term
+# $hi1 x18 s2 the high word of reduction term + Carry
+# $nj x19 s3 np[j],modulus
+# $m1 x20 s4 montgomery reduction coefficient
+# $nlo x21 s5 the low word of reduction term
+# $nhi x22 s6 the high word of reduction term
+# $ovf x23 s7 highest carry bit,overflow flag
+# $i x24 s8 outer loop index
+# $j x25 s9 inner loop index
+# $tp x26 s10 temporary result storage
+# $tj x27 s11 tp[j],temporary result value
+# $temp x9 s1
my ($lo0,$hi0,$aj,$m0,$alo,$ahi,$lo1,$hi1,$nj,$m1,$nlo,$nhi,$ovf,$i,$j,$tp,$tj,$temp) = use_regs(5..7,28..31,18..27,9);
# Carry variable
@@ -129,7 +129,7 @@ my $code .= <<___;
.text
.balign 32
.globl bn_mul_mont
-.type bn_mul_mont,\@function
+.type bn_mul_mont,\@function
bn_mul_mont:
___
@@ -139,26 +139,25 @@ $code .= <<___;
mv $fp, sp
___
-$code .= <<___;
+$code .= <<___;
ld $m0, 0($bp) # bp[0]
addi $bp, $bp,8
ld $hi0, 0($ap) # ap[0]
+ slli $num, $num, 3
+ sub $tp, sp, $num
ld $aj, 8($ap) # ap[1]
addi $ap, $ap, 16
ld $n0, 0($n0) # n0,precomputed modular inverse
+ andi $tp, $tp, -16 # address alignment
ld $hi1, 0($np) # np[0]
+ mv sp, $tp # alloca
ld $nj, 8($np) # np[1]
addi $np, $np, 16
- slli $num, $num, 3
- sub $tp, sp, $num
- andi $tp, $tp, -16 # address alignment
- mv sp, $tp # alloca
-
- addi $j, $num, -16 # $j=(num-2)*8
mul $lo0, $hi0, $m0 # ap[0]*bp[0]
- mulhu $hi0, $hi0, $m0
+ addi $j, $num, -16 # $j=(num-2)*8
+ mulhu $hi0, $hi0, $m0
mul $alo, $aj, $m0 # ap[1]*bp[0]
mulhu $ahi, $aj, $m0
@@ -166,12 +165,12 @@ $code .= <<___;
# montgomery optimization: np[0]*m1 ensures (np[0]*m1+lo0) has zero lower bits
# only carry status needed, not full lo1 result
# eliminates mul/adds instructions → Saves cycles & power
- # mul $lo1, $hi1, $m1 // np[0]*m1
+ # mul $lo1, $hi1, $m1 // np[0]*m1
# adds $lo1, $lo1, $lo0 // discarded
mulhu $hi1, $hi1, $m1
snez $carry1, $lo0
- add $hi1, $hi1, $carry1
mul $nlo, $nj, $m1 # np[1]*m1
+ add $hi1, $hi1, $carry1
mulhu $nhi, $nj, $m1
beqz $j, .L1st_last_entry
@@ -191,46 +190,43 @@ $code .= <<___;
# compute the sum of reduction term
add $lo1, $nlo, $hi1 # {np[j-1]*m1,low}+{np[j-2]*m1,high}, j ranges from 2 to num-1
sltu $carry1, $lo1, $nlo
+ mul $alo, $aj, $m0 # ap[j]*bp[0], j ranges from 2 to num-1
add $hi1, $nhi, $carry1 # {np[j-1]*m1,high}+C_lo1, j ranges from 2 to num-1
-
+ mulhu $ahi, $aj, $m0
# partial product + reduction term
- add $temp, $lo1, $lo0
- sltu $carry1, $temp, $lo1
- mv $lo1, $temp
+ add $lo1, $lo1, $lo0
+ sltu $carry1, $lo1, $lo0
+ mul $nlo, $nj, $m1 # np[j]*m1, j ranges from 2 to num-1
add $hi1, $hi1, $carry1
-
+ mulhu $nhi, $nj, $m1
sd $lo1, 0($tp) # tp[j-2], j ranges from 2 to num-1
addi $tp, $tp, 8
- mul $alo, $aj, $m0 # ap[j]*bp[0], j ranges from 2 to num-1
- mulhu $ahi, $aj, $m0
- mul $nlo, $nj, $m1 # np[j]*m1, j ranges from 2 to num-1
- mulhu $nhi, $nj, $m1
bnez $j, .L1st
.L1st_last_entry:
# last partial product
add $lo0, $alo, $hi0 # {ap[j]*bp[0],low}+{ap[j-1]*bp[0],high}, j is num-1
sltu $carry1, $lo0, $alo
- add $hi0, $ahi, $carry1 # {ap[j]*bp[0],high}+C_lo0, j is num-1
-
sub $ap, $ap, $num # rewind $ap
- sub $np, $np, $num # rewind $np
+ add $hi0, $ahi, $carry1 # {ap[j]*bp[0],high}+C_lo0, j is num-1
# last reduction term
add $lo1, $nlo, $hi1 # {np[j]*m1,low}+{np[j-1]*m1,high}, j is num-1
sltu $carry1, $lo1, $nlo
+ sub $np, $np, $num # rewind $np
add $hi1, $nhi, $carry1 # {np[j]*m1,high}+C_lo1, j is num-1
# last partial product + last reduction term
add $lo1, $lo1, $lo0
sltu $carry1, $lo1, $lo0
- add $temp, $hi1, $hi0
- sltu $carry2, $temp, $hi1
- add $hi1, $temp, $carry1
- sltu $ovf, $hi1, $temp
+ add $hi1, $hi1, $hi0
+ sltu $carry2, $hi1, $hi0
+ add $hi1, $hi1, $carry1
+ sltu $ovf, $hi1, $carry1
or $carry1, $carry2, $ovf # carry2 and ovf are mutually exclusive, both cannot be 1 simultaneously
+
mv $ovf, $carry1 # upmost overflow bit
addi $i, $num, -8 # $i=(num-1)*8
@@ -248,18 +244,16 @@ $code .= <<___;
addi $tp, sp, 8 # tp[1]
mul $lo0, $hi0, $m0 # ap[0]*bp[i], i ranges from 1 to num-1
+ addi $j, $num,-16 # $j=(num-2)*8
mulhu $hi0, $hi0, $m0
-
- addi $j, $num,-16 # $j=(num-2)*8
ld $hi1, 0($np)
ld $nj, 8($np)
addi $np, $np, 16
mul $alo, $aj, $m0 # ap[1]*bp[i], i ranges from 1 to num-1
- mulhu $ahi, $aj, $m0
-
add $lo0, $lo0, $tj # ap[0]*bp[i] + last_tp[0] , i ranges from 1 to num-1
sltu $carry1, $lo0, $tj
+ mulhu $ahi, $aj, $m0
add $hi0, $hi0, $carry1 # $hi0 will not overflow
# compute the modular reduction coefficient
@@ -267,8 +261,6 @@ $code .= <<___;
addi $i, $i, -8 # $i--, $i ranges from (num-1)*8 to 0
- # mul $lo1, $hi1, $m1 # discarded
- # adds $lo1, $lo1, $lo0 # discarded
mulhu $hi1, $hi1, $m1
snez $carry1, $lo0
mul $nlo, $nj, $m1 # np[1]*m1
@@ -299,20 +291,18 @@ $code .= <<___;
ld $nj, 0($np)
addi $np, $np, 8
+ mul $alo, $aj, $m0 # ap[j]*bp[i], j ranges from 2 to num-1, i ranges from 1 to num-1
# partial product + reduction term
add $lo0, $lo0, $tj
sltu $carry1, $lo0, $tj
+ mulhu $ahi, $aj, $m0
add $hi0, $hi0, $carry1
+ mul $nlo, $nj, $m1 # np[j]*m1, j ranges from 2 to num-1
add $lo1, $lo1, $lo0
sltu $carry1, $lo1, $lo0
-
- sd $lo1, -16($tp) # tp[j-2], j ranges from 2 to num-1
-
- mul $alo, $aj, $m0 # ap[j]*bp[i], j ranges from 2 to num-1, i ranges from 1 to num-1
- mulhu $ahi, $aj, $m0
- mul $nlo, $nj, $m1 # np[j]*m1, j ranges from 2 to num-1
mulhu $nhi, $nj, $m1
+ sd $lo1, -16($tp) # tp[j-2], j ranges from 2 to num-1
bnez $j, .Linner
@@ -324,19 +314,21 @@ $code .= <<___;
# last partial product
add $lo0, $alo, $hi0 # {ap[j]*bp[i],low}+{ap[j-1]*bp[i],high}, j is num-1, i ranges from 1 to num-1
sltu $carry1, $lo0, $alo
- add $hi0, $ahi, $carry1 # {ap[j]*bp[i],high}+C_lo0, j is num-1, i ranges from 1 to num-1
-
sub $ap, $ap, $num # rewind $ap
- sub $np, $np, $num # rewind $np
+ add $hi0, $ahi, $carry1 # {ap[j]*bp[i],high}+C_lo0, j is num-1, i ranges from 1 to num-1
# last reduction term
add $lo1, $nlo, $hi1 # {np[j]*m1,low}+{np[j-1]*m1,high}, j is num-1
sltu $carry1, $lo1, $nlo
- add $temp, $nhi, $ovf
- sltu $carry2, $temp, $nhi
- add $hi1, $temp, $carry1 # {np[j]*m1,high}+C_lo1, j is num-1
- sltu $ovf, $hi1, $temp
+
+ sub $np, $np, $num # rewind $np
+
+ add $hi1, $nhi, $ovf
+ sltu $carry2, $hi1, $ovf
+ add $hi1, $hi1, $carry1 # {np[j]*m1,high}+C_lo1, j is num-1
+ sltu $ovf, $hi1, $carry1
or $carry1, $carry2, $ovf
+
mv $ovf, $carry1 # update the upmost overflow bit
# last partial product + last reduction term
@@ -346,10 +338,11 @@ $code .= <<___;
add $lo1, $lo1, $lo0
sltu $carry1, $lo1, $lo0
- add $temp, $hi1, $hi0
- sltu $carry2, $temp, $hi1
- add $hi1, $temp, $carry1
- sltu $carry1, $hi1, $temp
+
+ add $hi1, $hi1, $hi0
+ sltu $carry2, $hi1, $hi0
+ add $hi1, $hi1, $carry1
+ sltu $carry1, $hi1, $carry1
or $carry1, $carry2, $carry1
add $ovf, $ovf, $carry1 # upmost overflow bit
@@ -363,18 +356,16 @@ $code .= <<___;
ld $nj, 0($np) # np[0]
addi $np, $np, 8
addi $j, $num, -8 # $j=(num-1)*8 and clear borrow
- sltu $carry1, $num, 8
- xori $carry1, $carry1, 1
+
+ li $carry1,0 # Custom, no borrow, C=0 (normal case, with borrow C=1)
mv $ap, $rp
.Lsub:
# tp[j]-np[j], j ranges from 0 to num-2, set carry flag
- xori $carry1, $carry1,1
sub $temp, $tj, $nj
sltu $carry2, $tj, $temp
sub $aj, $temp, $carry1
sltu $carry1, $temp, $aj
or $carry1, $carry2, $carry1
- xori $carry1, $carry1, 1
ld $tj, 0($tp) # tp[j], j ranges from 1 to num-1
addi $tp, $tp, 8
@@ -387,21 +378,15 @@ $code .= <<___;
bnez $j, .Lsub
# process the last word, tp[j]-np[j], j is num-1
- xori $carry1, $carry1,1
sub $temp, $tj, $nj
sltu $carry2, $tj, $temp
sub $aj, $temp, $carry1
sltu $carry1, $temp, $aj
or $carry1, $carry2, $carry1
- xori $carry1, $carry1, 1
-
+
# whether there is a borrow
- xori $carry1, $carry1, 1
- sub $temp, $ovf, zero
- sltu $carry2, $ovf, $temp
- sub $ovf, $temp, $carry1
- sltu $carry1, $temp, $ovf
- or $carry1, $carry2, $carry1
+ sub $temp, $ovf, $carry1
+ sltu $carry1, $ovf, $temp
xori $carry1, $carry1, 1
sd $aj, 0($ap) # rp[j], j is num-1
@@ -414,7 +399,6 @@ $code .= <<___;
addi $rp, $rp, 8
addi $num, $num, -8 # num--
nop
-
.Lcond_copy:
addi $num,$num, -8 # num--
# conditionally selects value based on borrow flag:
@@ -453,7 +437,7 @@ $code .= load_regs();
$code .= <<___;
ret
-.size bn_mul_mont,.-bn_mul_mont
+.size bn_mul_mont,.-bn_mul_mont
___
print $code;
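
A note on the "# mul $lo1, $hi1, $m1 // np[0]*m1 ... discarded" comments in the diff: bn_mul_mont receives n0 precomputed as -np[0]^-1 mod 2^64, so the reduction coefficient m1 is chosen such that np[0]*m1 + lo0 has a zero low word; only its carry out is needed, and that carry is simply (lo0 != 0), the snez in the code. A small C sketch of the property, assuming uint64_t for BN_ULONG and hypothetical names lo0/np0/n0 mirroring the register comments in the file:

    #include <assert.h>
    #include <stdint.h>

    /* Why the low word of np[0]*m1 + lo0 can be discarded: m1 = lo0 * n0
     * with n0 = -np[0]^-1 mod 2^64, so np[0]*m1 = -lo0 mod 2^64 and the
     * sum's low word is always zero; its carry out is just (lo0 != 0). */
    static uint64_t reduction_carry(uint64_t lo0, uint64_t np0, uint64_t n0)
    {
        uint64_t m1  = lo0 * n0;        /* mul  m1, lo0, n0      */
        uint64_t low = np0 * m1 + lo0;  /* the discarded mul/add */
        assert(low == 0);               /* holds for odd np0     */
        return lo0 != 0;                /* snez carry1, lo0      */
    }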
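
The .Lsub changes switch the borrow convention: $carry1 now starts at 0 meaning "no borrow" (the in-line comment calls this the normal case, with borrow C=1), which removes the xori inversions on every iteration. The subtract-with-borrow itself is two sub/sltu pairs whose borrows are or'ed together; they can never both be 1, so or is as good as add here. A C sketch of one .Lsub iteration, again with uint64_t for BN_ULONG and sub_with_borrow as a hypothetical helper name:

    #include <stdint.h>

    /* One word of the final tp - np subtraction: subtract the modulus
     * word, then the incoming borrow, recovering each borrow with sltu. */
    static uint64_t sub_with_borrow(uint64_t tj, uint64_t nj,
                                    uint64_t *borrow /* in/out, 0 or 1 */)
    {
        uint64_t t  = tj - nj;    /* sub  temp, tj, nj           */
        uint64_t b1 = tj < t;     /* sltu carry2, tj, temp       */
        uint64_t r  = t - *borrow;/* sub  aj, temp, carry1       */
        uint64_t b2 = t < r;      /* sltu carry1, temp, aj       */
        *borrow = b1 | b2;        /* or   carry1, carry2, carry1 */
        return r;
    }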