Commit b34e890c21 for openssl.org

commit b34e890c2104550c9a02b29eeb81da44240a2aac
Author: fengpengbo <feng.pengbo@zte.com.cn>
Date:   Fri Dec 19 11:00:59 2025 +0800

    This PR further optimizes the already merged "Implement Montgomery multiplication assembly optimization for RV64GC" (#28012).
    The key improvements include: code formatting unification, instruction scheduling optimization, an updated register allocation strategy, and updated handling of the addition/subtraction carry and borrow flags.
    These enhancements aim to improve both code maintainability and execution performance, particularly on out-of-order RISC-V cores.

    Reviewed-by: Neil Horman <nhorman@openssl.org>
    Reviewed-by: Paul Dale <paul.dale@oracle.com>
    (Merged from https://github.com/openssl/openssl/pull/29438)

diff --git a/crypto/bn/asm/riscv64-mont.pl b/crypto/bn/asm/riscv64-mont.pl
index b512dcfd01..af55d0c1c3 100644
--- a/crypto/bn/asm/riscv64-mont.pl
+++ b/crypto/bn/asm/riscv64-mont.pl
@@ -57,7 +57,7 @@ $output and open STDOUT,">$output";
             $stack_offset -= 8;
             $ret.="    sd      $_,$stack_offset(sp)\n";
         }
-	    return $ret;
+        return $ret;
     }
     sub load_regs {
         my $ret = '';
@@ -70,7 +70,7 @@ $output and open STDOUT,">$output";
             $stack_offset -= 8;
             $ret.="    ld      $_,$stack_offset(sp)\n";
         }
-	    $ret.="    addi    sp,sp,$stack_reservation\n";
+        $ret.="    addi    sp,sp,$stack_reservation\n";
         return $ret;
     }
     sub clear_regs {
@@ -84,12 +84,12 @@ $output and open STDOUT,">$output";

 # Function arguments
 #      RISC-V    ABI
-# $rp	x10	     a0  # BN_ULONG *rp
-# $ap	x11	     a1  # const BN_ULONG *ap
-# $bp	x12	     a2  # const BN_ULONG *bp
-# $np	x13	     a3  # const BN_ULONG *np
-# $n0	x14      a4  # const BN_ULONG *n0
-# $num	x15      a5  # int num
+# $rp   x10     a0  # BN_ULONG *rp
+# $ap   x11     a1  # const BN_ULONG *ap
+# $bp   x12     a2  # const BN_ULONG *bp
+# $np   x13     a3  # const BN_ULONG *np
+# $n0   x14     a4  # const BN_ULONG *n0
+# $num  x15     a5  # int num
 my ($rp,$ap,$bp,$np,$n0,$num) = use_regs(10,11,12,13,14,15);

 # Return address and Frame pointer
@@ -100,24 +100,24 @@ my ($ra,$fp) = use_regs(1,8);

 # Temporary variable allocation
 #      RISC-V    ABI
-# $lo0	x5	     t0    the sum of partial products of a and b
-# $hi0	x6	     t1    the high word of partial product of a and b + Carry
-# $aj	x7	     t2    ap[j]
-# $m0	x28	     t3    bp[i]
-# $alo	x29	     t4    the low word of partial product
-# $ahi	x30      t5    the high word of partial product
-# $lo1	x31	     t6    partial product + reduction term
-# $hi1	x18	     s2    the high word of reduction term + Carry
-# $nj	x19	     s3    np[j],modulus
-# $m1	x20	     s4    montgomery reduction coefficient
-# $nlo	x21	     s5    the low word of reduction term
-# $nhi	x22	     s6    the high word of reduction term
-# $ovf	x23	     s7    highest carry bit,overflow flag
-# $i	x24	     s8    outer loop index
-# $j	x25	     s9    inner loop index
-# $tp	x26	     s10   temporary result storage
-# $tj	x27	     s11   tp[j],temporary result value
-# $temp x9       s1
+# $lo0  x5     t0    the sum of partial products of a and b
+# $hi0  x6     t1    the high word of partial product of a and b + Carry
+# $aj   x7     t2    ap[j]
+# $m0   x28    t3    bp[i]
+# $alo  x29    t4    the low word of partial product
+# $ahi  x30    t5    the high word of partial product
+# $lo1  x31    t6    partial product + reduction term
+# $hi1  x18    s2    the high word of reduction term + Carry
+# $nj   x19    s3    np[j],modulus
+# $m1   x20    s4    montgomery reduction coefficient
+# $nlo  x21    s5    the low word of reduction term
+# $nhi  x22    s6    the high word of reduction term
+# $ovf  x23    s7    highest carry bit,overflow flag
+# $i    x24    s8    outer loop index
+# $j    x25    s9    inner loop index
+# $tp   x26    s10   temporary result storage
+# $tj   x27    s11   tp[j],temporary result value
+# $temp x9     s1
 my ($lo0,$hi0,$aj,$m0,$alo,$ahi,$lo1,$hi1,$nj,$m1,$nlo,$nhi,$ovf,$i,$j,$tp,$tj,$temp) = use_regs(5..7,28..31,18..27,9);

 # Carry variable
@@ -129,7 +129,7 @@ my $code .= <<___;
 .text
 .balign 32
 .globl bn_mul_mont
-.type   bn_mul_mont,\@function
+.type bn_mul_mont,\@function
 bn_mul_mont:
 ___

@@ -139,26 +139,25 @@ $code .= <<___;
     mv $fp, sp
 ___

-$code .= <<___;
+$code .= <<___;
     ld $m0, 0($bp)    # bp[0]
     addi $bp, $bp,8
     ld $hi0, 0($ap)    # ap[0]
+    slli $num, $num, 3
+    sub $tp, sp, $num
     ld $aj, 8($ap)    # ap[1]
     addi $ap, $ap, 16
     ld $n0, 0($n0)    # n0,precomputed modular inverse
+    andi $tp, $tp, -16    # address alignment
     ld $hi1, 0($np)    # np[0]
+    mv sp, $tp    # alloca
     ld $nj, 8($np)    # np[1]
     addi $np, $np, 16

-    slli $num, $num, 3
-    sub $tp, sp, $num
-    andi $tp, $tp, -16    # address alignment
-    mv sp, $tp    # alloca
-
-    addi $j, $num, -16    # $j=(num-2)*8

     mul $lo0, $hi0, $m0    # ap[0]*bp[0]
-    mulhu $hi0, $hi0, $m0
+    addi $j, $num, -16    # $j=(num-2)*8
+    mulhu $hi0, $hi0, $m0
     mul $alo, $aj, $m0    # ap[1]*bp[0]
     mulhu $ahi, $aj, $m0

@@ -166,12 +165,12 @@ $code .= <<___;
     # montgomery optimization: np[0]*m1 ensures (np[0]*m1+lo0) has zero lower bits
     # only carry status needed, not full lo1 result
     # eliminates mul/adds instructions → Saves cycles & power
-    # mul $lo1, $hi1, $m1		// np[0]*m1
+    # mul $lo1, $hi1, $m1   // np[0]*m1
     # adds $lo1, $lo1, $lo0   // discarded
     mulhu $hi1, $hi1, $m1
     snez $carry1, $lo0
-    add $hi1, $hi1, $carry1
     mul $nlo, $nj, $m1    # np[1]*m1
+    add $hi1, $hi1, $carry1
     mulhu $nhi, $nj, $m1
     beqz $j, .L1st_last_entry

@@ -191,46 +190,43 @@ $code .= <<___;
     # compute the sum of reduction term
     add $lo1, $nlo, $hi1    # {np[j-1]*m1,low}+{np[j-2]*m1,high}, j ranges from 2 to num-1
     sltu $carry1, $lo1, $nlo
+    mul $alo, $aj, $m0    # ap[j]*bp[0], j ranges from 2 to num-1
     add $hi1, $nhi, $carry1    # {np[j-1]*m1,high}+C_lo1, j ranges from 2 to num-1
-
+    mulhu $ahi, $aj, $m0
     # partial product + reduction term
-    add $temp, $lo1, $lo0
-    sltu $carry1, $temp, $lo1
-    mv $lo1, $temp
+    add $lo1, $lo1, $lo0
+    sltu $carry1, $lo1, $lo0
+    mul $nlo, $nj, $m1    # np[j]*m1, j ranges from 2 to num-1
     add $hi1, $hi1, $carry1
-
+    mulhu $nhi, $nj, $m1
     sd $lo1, 0($tp)    # tp[j-2], j ranges from 2 to num-1
     addi $tp, $tp, 8

-    mul $alo, $aj, $m0    # ap[j]*bp[0], j ranges from 2 to num-1
-    mulhu $ahi, $aj, $m0
-    mul $nlo, $nj, $m1    # np[j]*m1, j ranges from 2 to num-1
-    mulhu $nhi, $nj, $m1
     bnez $j, .L1st

 .L1st_last_entry:
     # last partial product
     add $lo0, $alo, $hi0    # {ap[j]*bp[0],low}+{ap[j-1]*bp[0],high}, j is num-1
     sltu $carry1, $lo0, $alo
-    add $hi0, $ahi, $carry1    # {ap[j]*bp[0],high}+C_lo0, j is num-1
-
     sub $ap, $ap, $num    # rewind $ap
-    sub $np, $np, $num    # rewind $np
+    add $hi0, $ahi, $carry1    # {ap[j]*bp[0],high}+C_lo0, j is num-1

     # last reduction term
     add $lo1, $nlo, $hi1    # {np[j]*m1,low}+{np[j-1]*m1,high}, j is num-1
     sltu $carry1, $lo1, $nlo
+    sub $np, $np, $num    # rewind $np
     add $hi1, $nhi, $carry1    # {np[j]*m1,high}+C_lo1, j is num-1

     # last partial product + last reduction term
     add $lo1, $lo1, $lo0
     sltu $carry1, $lo1, $lo0

-    add $temp, $hi1, $hi0
-    sltu $carry2, $temp, $hi1
-    add $hi1, $temp, $carry1
-    sltu $ovf, $hi1, $temp
+    add $hi1, $hi1, $hi0
+    sltu $carry2, $hi1, $hi0
+    add $hi1, $hi1, $carry1
+    sltu $ovf, $hi1, $carry1
     or $carry1, $carry2, $ovf    # carry2 and ovf are mutually exclusive, both cannot be 1 simultaneously
+
     mv $ovf, $carry1    # upmost overflow bit

     addi $i, $num, -8    # $i=(num-1)*8
@@ -248,18 +244,16 @@ $code .= <<___;
     addi $tp, sp, 8    # tp[1]

     mul $lo0, $hi0, $m0    # ap[0]*bp[i], i ranges from 1 to num-1
+    addi $j, $num,-16    # $j=(num-2)*8
     mulhu $hi0, $hi0, $m0
-
-    addi $j, $num,-16    # $j=(num-2)*8
     ld $hi1, 0($np)
     ld $nj, 8($np)
     addi $np, $np, 16

     mul $alo, $aj, $m0    # ap[1]*bp[i], i ranges from 1 to num-1
-    mulhu $ahi, $aj, $m0
-
     add $lo0, $lo0, $tj    # ap[0]*bp[i] + last_tp[0] , i ranges from 1 to num-1
     sltu $carry1, $lo0, $tj
+    mulhu $ahi, $aj, $m0
     add $hi0, $hi0, $carry1    # $hi0 will not overflow

     # compute the modular reduction coefficient
@@ -267,8 +261,6 @@ $code .= <<___;

     addi $i, $i, -8    # $i--, $i ranges from (num-1)*8 to 0

-    # mul $lo1, $hi1, $m1	 # discarded
-    # adds	$lo1, $lo1, $lo0   # discarded
     mulhu $hi1, $hi1, $m1
     snez $carry1, $lo0
     mul $nlo, $nj, $m1    # np[1]*m1
@@ -299,20 +291,18 @@ $code .= <<___;
     ld $nj, 0($np)
     addi $np, $np, 8

+    mul $alo, $aj, $m0    # ap[j]*bp[i], j ranges from 2 to num-1, i ranges from 1 to num-1
     # partial product + reduction term
     add $lo0, $lo0, $tj
     sltu $carry1, $lo0, $tj
+    mulhu $ahi, $aj, $m0
     add $hi0, $hi0, $carry1

+    mul $nlo, $nj, $m1    # np[j]*m1, j ranges from 2 to num-1
     add $lo1, $lo1, $lo0
     sltu $carry1, $lo1, $lo0
-
-    sd $lo1, -16($tp)    # tp[j-2], j ranges from 2 to num-1
-
-    mul $alo, $aj, $m0    # ap[j]*bp[i], j ranges from 2 to num-1, i ranges from 1 to num-1
-    mulhu $ahi, $aj, $m0
-    mul $nlo, $nj, $m1    # np[j]*m1, j ranges from 2 to num-1
     mulhu $nhi, $nj, $m1
+    sd $lo1, -16($tp)    # tp[j-2], j ranges from 2 to num-1

     bnez $j, .Linner

@@ -324,19 +314,21 @@ $code .= <<___;
     # last partial product
     add $lo0, $alo, $hi0    # {ap[j]*bp[i],low}+{ap[j-1]*bp[i],high}, j is num-1, i ranges from 1 to num-1
     sltu $carry1, $lo0, $alo
-    add $hi0, $ahi, $carry1    # {ap[j]*bp[i],high}+C_lo0, j is num-1, i ranges from 1 to num-1
-
     sub $ap, $ap, $num    # rewind $ap
-    sub	$np, $np, $num    # rewind $np
+    add $hi0, $ahi, $carry1    # {ap[j]*bp[i],high}+C_lo0, j is num-1, i ranges from 1 to num-1

     # last reduction term
     add $lo1, $nlo, $hi1    # {np[j]*m1,low}+{np[j-1]*m1,high}, j is num-1
     sltu $carry1, $lo1, $nlo
-    add $temp, $nhi, $ovf
-    sltu $carry2, $temp, $nhi
-    add $hi1, $temp, $carry1    # {np[j]*m1,high}+C_lo1, j is num-1
-    sltu $ovf, $hi1, $temp
+
+    sub $np, $np, $num    # rewind $np
+
+    add $hi1, $nhi, $ovf
+    sltu $carry2, $hi1, $ovf
+    add $hi1, $hi1, $carry1    # {np[j]*m1,high}+C_lo1, j is num-1
+    sltu $ovf, $hi1, $carry1
     or $carry1, $carry2, $ovf
+
     mv $ovf, $carry1    # update the upmost overflow bit

     # last partial product + last reduction term
@@ -346,10 +338,11 @@ $code .= <<___;

     add $lo1, $lo1, $lo0
     sltu $carry1, $lo1, $lo0
-    add $temp, $hi1, $hi0
-    sltu $carry2, $temp, $hi1
-    add $hi1, $temp, $carry1
-    sltu $carry1, $hi1, $temp
+
+    add $hi1, $hi1, $hi0
+    sltu $carry2, $hi1, $hi0
+    add $hi1, $hi1, $carry1
+    sltu $carry1, $hi1, $carry1
     or $carry1, $carry2, $carry1

     add $ovf, $ovf, $carry1    # upmost overflow bit
@@ -363,18 +356,16 @@ $code .= <<___;
     ld $nj, 0($np)    # np[0]
     addi $np, $np, 8
     addi $j, $num, -8    # $j=(num-1)*8 and clear borrow
-    sltu $carry1, $num, 8
-    xori $carry1, $carry1, 1
+
+    li $carry1,0   # start with no borrow: here $carry1=1 means a borrow occurred, 0 means none
     mv $ap, $rp
 .Lsub:
     # tp[j]-np[j], j ranges from 0 to num-2, set carry flag
-    xori $carry1, $carry1,1
     sub $temp, $tj, $nj
     sltu $carry2, $tj, $temp
     sub $aj, $temp, $carry1
     sltu $carry1, $temp, $aj
     or $carry1, $carry2, $carry1
-    xori $carry1, $carry1, 1

     ld $tj, 0($tp)    # tp[j], j ranges from 1 to num-1
     addi $tp, $tp, 8
@@ -387,21 +378,15 @@ $code .= <<___;
     bnez $j, .Lsub

     # process the last word, tp[j]-np[j], j is num-1
-    xori $carry1, $carry1,1
     sub $temp, $tj, $nj
     sltu $carry2, $tj, $temp
     sub $aj, $temp, $carry1
     sltu $carry1, $temp, $aj
     or $carry1, $carry2, $carry1
-    xori $carry1, $carry1, 1
-
+
     # whether there is a borrow
-    xori $carry1, $carry1, 1
-    sub $temp, $ovf, zero
-    sltu $carry2, $ovf, $temp
-    sub $ovf, $temp, $carry1
-    sltu $carry1, $temp, $ovf
-    or $carry1, $carry2, $carry1
+    sub $temp, $ovf, $carry1
+    sltu $carry1, $ovf, $temp
     xori $carry1, $carry1, 1

     sd $aj, 0($ap)    # rp[j], j is num-1
@@ -414,7 +399,6 @@ $code .= <<___;
     addi $rp, $rp, 8
     addi $num, $num, -8    # num--
     nop
-
 .Lcond_copy:
     addi $num,$num, -8    # num--
     # conditionally selects value based on borrow flag:
@@ -453,7 +437,7 @@ $code .= load_regs();

 $code .= <<___;
     ret
-.size	bn_mul_mont,.-bn_mul_mont
+.size bn_mul_mont,.-bn_mul_mont
 ___

 print $code;