Commit ed002f9958 for openssl.org

commit ed002f99588db5f8f8658869a66fbf195f2a17ae
Author: fengpengbo <feng.pengbo@zte.com.cn>
Date:   Fri Feb 27 11:18:22 2026 +0800

    Add optimized Montgomery squaring for RV64GC

    This PR adds an RV64GC-optimized Montgomery squaring assembly, ported from the ARMv8 __bn_sqr8x_mont algorithm, complementing the earlier multiplication optimization (#28012).

    Performance Improvement

    | Test Item  | Baseline (C)  | ASM Optimized  | Improvement  |
    | ---------- | ------------- | -------------- | ------------ |
    | sign/s     | 208           | 220            | 5.77%        |
    | verify/s   | 7190          | 7956.6         | 10.66%       |
    | encr./s    | 6638          | 7156.0         | 7.8%         |
    | decr./s    | 203           | 216            | 6.4%         |

    Reviewed-by: Neil Horman <nhorman@openssl.org>
    Reviewed-by: Paul Dale <paul.dale@oracle.com>
    MergeDate: Fri Mar 13 17:21:14 2026
    (Merged from https://github.com/openssl/openssl/pull/29440)

diff --git a/crypto/bn/asm/riscv64-mont.pl b/crypto/bn/asm/riscv64-mont.pl
index af55d0c1c3..f9db3c11eb 100644
--- a/crypto/bn/asm/riscv64-mont.pl
+++ b/crypto/bn/asm/riscv64-mont.pl
@@ -123,7 +123,7 @@ my ($lo0,$hi0,$aj,$m0,$alo,$ahi,$lo1,$hi1,$nj,$m1,$nlo,$nhi,$ovf,$i,$j,$tp,$tj,$
 # Carry variable
 # $carry1 x16      a6
 # $carry2 x17      a7
-my ($carry1,$carry2) = use_regs(16,17);
+my ($carry1,$carry2,$numtst) = use_regs(16,17,17);

 my $code .= <<___;
 .text
@@ -131,6 +131,9 @@ my $code .= <<___;
 .globl bn_mul_mont
 .type bn_mul_mont,\@function
 bn_mul_mont:
+    andi  $numtst, $num, 7
+    beqz  $numtst, bn_sqr8x_mont
+.Lmul_mont:
 ___

 $code .= save_regs();
@@ -157,7 +160,7 @@ $code .= <<___;

     mul $lo0, $hi0, $m0    # ap[0]*bp[0]
     addi $j, $num, -16    # $j=(num-2)*8
-    mulhu $hi0, $hi0, $m0
+    mulhu $hi0, $hi0, $m0
     mul $alo, $aj, $m0    # ap[1]*bp[0]
     mulhu $ahi, $aj, $m0

@@ -244,7 +247,7 @@ $code .= <<___;
     addi $tp, sp, 8    # tp[1]

     mul $lo0, $hi0, $m0    # ap[0]*bp[i], i ranges from 1 to num-1
-    addi $j, $num,-16    # $j=(num-2)*8
+    addi $j, $num,-16    # $j=(num-2)*8
     mulhu $hi0, $hi0, $m0
     ld $hi1, 0($np)
     ld $nj, 8($np)
@@ -291,7 +294,7 @@ $code .= <<___;
     ld $nj, 0($np)
     addi $np, $np, 8

-    mul $alo, $aj, $m0    # ap[j]*bp[i], j ranges from 2 to num-1, i ranges from 1 to num-1
+    mul $alo, $aj, $m0    # ap[j]*bp[i], j ranges from 2 to num-1, i ranges from 1 to num-1
     # partial product + reduction term
     add $lo0, $lo0, $tj
     sltu $carry1, $lo0, $tj
@@ -383,7 +386,7 @@ $code .= <<___;
     sub $aj, $temp, $carry1
     sltu $carry1, $temp, $aj
     or $carry1, $carry2, $carry1
-
+
     # whether there is a borrow
     sub $temp, $ovf, $carry1
     sltu $carry1, $ovf, $temp
@@ -440,5 +443,1437 @@ $code .= <<___;
 .size bn_mul_mont,.-bn_mul_mont
 ___

+{
+# Following is RISCV64 adaptation of __bn_sqr8x_mont from armv8-mont module.
+
+# Return address and Frame pointer
+#      RISC-V    ABI
+# $ra   x1       ra
+# $fp   x8       s0
+my ($ra,$fp) = use_regs(1,8);
+
+# Temporary variable allocation
+#      RISC-V    ABI
+# $a0   x5     t0
+# $a1   x6     t1
+# $a2   x7     t2
+# $a3   x28    t3
+# $a4   x9     s1
+# $a5   x18    s2
+# $a6   x19    s3
+# $a7   x20    s4
+my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = use_regs(5,6,7,28,9,18,19,20);
+
+# $t0   x16     a6
+# $t1   x17     a7
+# $t2   x29     t4
+# $t3   x30     t5
+my ($t0,$t1,$t2,$t3) = use_regs(16,17,29,30);
+
+# $acc0 x21    s5
+# $acc1 x22    s6
+# $acc2 x23    s7
+# $acc3 x24    s8
+# $acc4 x25    s9
+# $acc5 x31    t6
+# $acc6 x27    s11
+# $acc7 x26    s10
+my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = use_regs(21,22,23,24,25,31,27,26);
+
+# $temp   x14    a4<n0>
+# $carry1 x15    a5<num>
+# $carry2 x10    a0<rp>
+# $carry  x1     ra
+my ($temp, $carry1, $carry2, $carry) = use_regs(14,15,10,1);
+
+# $tp     x12    a2<bp>
+# $na0    x1     ra
+my ($tp,$na0) = use_regs(12,1);
+
+# Stack variables
+# rp       fp+96
+# np       fp+104
+# num      fp+112
+# n0       fp+120
+# ra       fp+128
+# cnt      fp+136
+# ap_end   fp+144
+# np_end   fp+152
+# topmost  fp+160
+
+# My_function
+sub adds {
+    # Simulate ARM 'adds':add with carry flag set
+    # (1) Final sum: dst = src1 + b, no input carry
+    # (2) Output carry, if src1 + b overflowed, carry = 1. src1 + b < b ? carry1=1 : carry1=0
+    my ($dst, $src1, $b) = @_;
+    my $code=<<___;
+    add $dst, $src1, $b
+    sltu $carry1, $dst, $b
+___
+    return $code;
+}
+
+sub adcs {
+    # Simulate ARM 'adcs': add with input carry and set output carry
+    # (1) Temp sum. dst = src1 + b, ignore input carry
+    # (2) Temp1 carry. if src1 + b overflowed, carry2 = 1. src1 + b < b ? carry2=1 : carry2=0
+    # (3) Final sum: dst += carry1 (input carry)
+    # (4) Temp2 carry. if adding input carry overflowed, carry1 = 1. dst + carry1 < carry1 ? carry1=1 : carry1=0
+    # (5) Final carry. carry1(carry_out) = carry2 | carry1
+    my ($dst, $src1, $b) = @_;
+    my $code=<<___;
+    add $dst, $src1, $b
+    sltu $carry2, $dst, $b
+    add $dst, $dst, $carry1
+    sltu $carry1, $dst, $carry1
+    or $carry1, $carry2, $carry1
+___
+    return $code;
+}
+
+sub adc {
+    # Simulate ARM 'adc': add with input carry1, no flags set
+    # (1) Temp sum: dst = src1 + b, ignore input carry
+    # (2) Final sum: dst += carry1 (input carry), ignore output carry
+    my ($dst, $src1, $b) = @_;
+    my $code=<<___;
+    add $dst,$src1,$b
+    add $dst,$dst,$carry1
+___
+    return $code;
+}
+
+sub extr {
+    # Simulate ARM 'extr': extract high bit and shift
+    # (1) carry2 = b >> 63 (extract highest bit of b)
+    # (2) dst = src1 << 1 (shift src1 left by 1, dst = src1*2)
+    # (3) dst |= carry2 (combine: insert b's high bit as dst's LSB, dst= carry2 + src1*2 )
+    my ($dst, $src1, $b) = @_;
+    my $code=<<___;
+    srli $carry2, $b, 63
+    slli $dst, $src1, 1
+    or   $dst, $dst, $carry2
+___
+    return $code;
+}
+
+sub subs{
+    # Simulate ARM 'subs': subtract with borrow flag set.
+    # (1) dst = src1 - b
+    # (2) if src1 < dst, set borrow flag,carry1 = 1
+    my ($dst, $src1, $b) = @_;
+    my $code=<<___;
+    sub $dst, $src1, $b
+    sltu $carry1, $src1, $dst
+___
+    return $code;
+}
+
+sub sbcs{
+    # Simulate ARM 'sbcs': subtract with input borrow (carry1) and set output borrow.
+    # (1) temp = src1 - b
+    # (2) if src1 < temp, set borrow flag, carry2 = 1
+    # (3) dst = temp - carry1 (borrow_in)
+    # (4) if temp < dst, set borrow flag, carry1 = 1
+    # (5) carry1 (borrow_out) = carry2 | carry1
+    my ($dst, $src1, $b) = @_;
+    my $code=<<___;
+    sub $temp, $src1, $b
+    sltu $carry2, $src1, $temp
+    sub $dst, $temp, $carry1
+    sltu $carry1, $temp, $dst
+    or $carry1, $carry2, $carry1
+___
+    return $code;
+}
+
+sub csel{
+    # Simulate ARM 'csel': conditional select based on carry
+    # Assumes carry1 is input condition, 1 if true/select b, 0 if false/select src1.
+    # dst = (carry1 == 0) ? src1 : b
+    # Uses bitwise ops for branchless execution
+    # (1) Normalize carry1: carry1 !=0 ? carry1 = 1 : carry1 = 0
+    # (2) Create mask: carry1 = 0 - carry1; yields -1 (all 1s) if true, 0 if false (i.e. a 0 or -1 mask)
+    # (3) Compute diff: dst = src1 ^ b (bitwise XOR highlights differing bits)
+    # (4) Mask diff: dst = diff & mask; yields diff if -1 (true), 0 if 0 (false)
+    # (5) Select: dst = src1 ^ dst; yields b if dst=diff (true), src1 if dst=0 (false)
+    my ($dst, $src1, $b) = @_;
+    my $code=<<___;
+    snez $carry1, $carry1
+    sub $carry1, zero, $carry1
+    xor $dst, $src1, $b
+    and $dst, $dst, $carry1
+    xor $dst, $src1, $dst
+___
+    return $code;
+}
+
+## store
+sub sd_rp{
+    my ($src1,$dst) = @_;
+    my $code=<<___;
+    sd $src1,96($dst)
+___
+    return $code;
+}
+
+sub sd_np{
+    my ($src1,$dst) = @_;
+    my $code=<<___;
+    sd $src1,104($dst)
+___
+    return $code;
+}
+
+sub sd_num{
+    my ($src1,$dst) = @_;
+    my $code=<<___;
+    sd $src1,112($dst)
+___
+    return $code;
+}
+
+sub sd_n0{
+    my ($src1,$dst) = @_;
+    my $code=<<___;
+    sd $src1,120($dst)
+___
+    return $code;
+}
+
+sub sd_ra{
+    my ($src1,$dst) = @_;
+    my $code=<<___;
+    sd $src1,128($dst)
+___
+    return $code;
+}
+
+sub sd_cnt{
+    my ($src1,$dst) = @_;
+    my $code=<<___;
+    sd $src1,136($dst)
+___
+    return $code;
+}
+
+sub sd_apend{
+    my ($src1,$dst) = @_;
+    my $code=<<___;
+    sd $src1,144($dst)
+___
+    return $code;
+}
+
+sub sd_npend{
+    my ($src1,$dst) = @_;
+    my $code=<<___;
+    sd $src1,152($dst)
+___
+    return $code;
+}
+
+sub sd_topmost{
+    my ($src1,$dst) = @_;
+    my $code=<<___;
+    sd $src1,160($dst)
+___
+    return $code;
+}
+
+## load
+sub ld_rp{
+    my ($dst,$src1) = @_;
+    my $code=<<___;
+    ld $dst,96($src1)
+___
+    return $code;
+}
+
+sub ld_np{
+    my ($dst,$src1) = @_;
+    my $code=<<___;
+    ld $dst,104($src1)
+___
+    return $code;
+}
+
+sub ld_num{
+    my ($dst,$src1) = @_;
+    my $code=<<___;
+    ld $dst,112($src1)
+___
+    return $code;
+}
+
+sub ld_n0{
+    my ($dst,$src1) = @_;
+    my $code=<<___;
+    ld $dst,120($src1)
+___
+    return $code;
+}
+
+sub ld_ra{
+    my ($dst,$src1) = @_;
+    my $code=<<___;
+    ld $dst,128($src1)
+___
+    return $code;
+}
+
+sub ld_cnt{
+    my ($dst,$src1) = @_;
+    my $code=<<___;
+    ld $dst,136($src1)
+___
+    return $code;
+}
+
+sub ld_apend{
+    my ($dst,$src1) = @_;
+    my $code=<<___;
+    ld $dst,144($src1)
+___
+    return $code;
+}
+
+sub ld_npend{
+    my ($dst,$src1) = @_;
+    my $code=<<___;
+    ld $dst,152($src1)
+___
+    return $code;
+}
+
+sub ld_topmost{
+    my ($dst,$src1) = @_;
+    my $code=<<___;
+    ld $dst,160($src1)
+___
+    return $code;
+}
+
+$code.=<<___;
+.type   bn_sqr8x_mont,%function
+.balign 32
+bn_sqr8x_mont:
+    # If ap != bp, -> normal multiplication, jump to .Lmul_mont
+    # If ap == bp, -> continue with optimized sqr8x path
+    bne $ap,$bp, .Lmul_mont
+___
+$code .= <<___;
+    addi    sp,sp,-168
+    sd      s0,88(sp)
+    sd      s1,80(sp)
+    sd      s2,72(sp)
+    sd      s3,64(sp)
+    sd      s4,56(sp)
+    sd      s5,48(sp)
+    sd      s6,40(sp)
+    sd      s7,32(sp)
+    sd      s8,24(sp)
+    sd      s9,16(sp)
+    sd      s11,8(sp)
+    sd      s10,0(sp)
+    mv $fp, sp
+___
+$code .= <<___;
+.Lsqr8x_mont:
+    ld $a0, 0($ap)    # load a[0-7]
+    ld $a1, 8($ap)
+    ld $a2, 16($ap)
+    ld $a3, 24($ap)
+    ld $a4, 32($ap)
+    ld $a5, 40($ap)
+    ld $a6, 48($ap)
+    ld $a7, 56($ap)
+
+    slli $t0, $num, 4
+    sub $tp, sp, $t0    # alloca sp-num*16
+    slli $num, $num, 3
+    ld $n0, 0($n0)    # n0, precomputed modular inverse
+    mv sp, $tp    # alloca
+
+    # Save temporary values to stack to release registers for subsequent operations
+    addi $t1,$num,-64    # number of bytes remaining to be zeroed
+    @{[sd_cnt $t1,$fp]}    # offload cnt
+    @{[sd_rp $rp,$fp]}    # offload rp
+    @{[sd_np $np,$fp]}    # offload np
+    @{[sd_num $num,$fp]}    # offload num
+    @{[sd_n0 $n0,$fp]}    # offload n0
+    @{[sd_ra $ra,$fp]}    # offload ra
+
+    j .Lsqr8x_zero_start
+
+
+    # Clear tp buffer to store partial products
+    # Zero 128 bytes per iteration until cache cleared
+.Lsqr8x_zero:
+    addi $carry2,$carry2,-64    # cnt=cnt-64
+    @{[sd_cnt $carry2,$fp]}
+
+    sd zero, 0($tp)
+    sd zero, 8($tp)
+    sd zero, 16($tp)
+    sd zero, 24($tp)
+    sd zero, 32($tp)
+    sd zero, 40($tp)
+    sd zero, 48($tp)
+    sd zero, 56($tp)
+
+.Lsqr8x_zero_start:
+    sd zero, 64($tp)
+    sd zero, 72($tp)
+    sd zero, 80($tp)
+    sd zero, 88($tp)
+    sd zero, 96($tp)
+    sd zero, 104($tp)
+    sd zero, 112($tp)
+    sd zero, 120($tp)
+    addi $tp,$tp,128
+
+    @{[ld_cnt $carry2,$fp]}
+    bnez $carry2,.Lsqr8x_zero    # cnt != 0 -> Lsqr8x_zero
+
+    @{[ld_num $temp,$fp]}
+    add $temp,$ap,$temp    # ap_end = ap + num
+    @{[sd_apend $temp,$fp]} # offload ap_end, the last element of ap
+    addi $ap,$ap,64    # ap += 64, skip initial 8 64-bit words
+    mv $acc0, zero
+    mv $acc1, zero
+    mv $acc2, zero
+    mv $acc3, zero
+    mv $acc4, zero
+    mv $acc5, zero
+    mv $acc6, zero
+    mv $acc7, zero
+    mv $tp,sp    # points to buffer start
+
+    # Multiply everything but a[i]*a[i]
+.balign 16
+.Lsqr8x_outer_loop:
+    # Compute cross products: a[j] * a[i] for all i < j
+    #                                                 a[1]a[0]  (i)
+    #                                             a[2]a[0]
+    #                                         a[3]a[0]
+    #                                     a[4]a[0]
+    #                                 a[5]a[0]
+    #                             a[6]a[0]
+    #                         a[7]a[0]
+    #                                         a[2]a[1]  (ii)
+    #                                     a[3]a[1]
+    #                                 a[4]a[1]
+    #                             a[5]a[1]
+    #                         a[6]a[1]
+    #                     a[7]a[1]
+    #                                 a[3]a[2]  (iii)
+    #                             a[4]a[2]
+    #                         a[5]a[2]
+    #                     a[6]a[2]
+    #                 a[7]a[2]
+    #                         a[4]a[3]  (iv)
+    #                     a[5]a[3]
+    #                 a[6]a[3]
+    #             a[7]a[3]
+    #                 a[5]a[4]  (v)
+    #             a[6]a[4]
+    #         a[7]a[4]
+    #         a[6]a[5]  (vi)
+    #     a[7]a[5]
+    # a[7]a[6]  (vii)
+    mul $t0,$a1,$a0    # lo(a[1..7]*a[0])  (i)
+    mul $t1,$a2,$a0
+    mul $t2,$a3,$a0
+    mul $t3,$a4,$a0
+
+    @{[adds $acc1, $acc1, $t0]}    # t[1]+lo(a[1]*a[0])
+    mul $t0,$a5,$a0
+    @{[adcs $acc2, $acc2, $t1]}    # t[2]+lo(a[2]*a[0])
+    mul $t1,$a6,$a0
+    @{[adcs $acc3, $acc3, $t2]}    # t[3]+lo(a[3]*a[0])
+    mul $t2,$a7,$a0
+    @{[adcs $acc4, $acc4, $t3]}    # t[4]+lo(a[4]*a[0])
+
+    mulhu $t3,$a1,$a0    # hi(a[1..7]*a[0])
+    @{[adcs $acc5, $acc5, $t0]}    # t[5]+lo(a[5]*a[0])
+    mulhu $t0,$a2,$a0
+    @{[adcs $acc6, $acc6, $t1]}    # t[6]+lo(a[6]*a[0])
+    mulhu $t1,$a3,$a0
+    @{[adcs $acc7, $acc7, $t2]}    # t[7]+lo(a[7]*a[0])
+    mulhu $t2,$a4,$a0
+    sd $acc0, 0($tp)    # offload acc0, store t[0]
+    add $acc0,zero,$carry1    # t[8]=0+c_pre{t[7]+lo(a[7]*a[0])}
+    sd $acc1, 8($tp)    # offload acc1, store t[1]
+    addi $tp,$tp,16    # tp=tp+16,advance buffer pointer by 16 bytes
+
+    @{[adds $acc2, $acc2, $t3]}    # t[2]+hi(a[1]*a[0])
+    mulhu $t3,$a5,$a0
+    @{[adcs $acc3, $acc3, $t0]}    # t[3]+hi(a[2]*a[0])
+    mulhu $t0,$a6,$a0
+    @{[adcs $acc4, $acc4, $t1]}    # t[4]+hi(a[3]*a[0])
+    mulhu $t1,$a7,$a0
+    @{[adcs $acc5, $acc5, $t2]}    # t[5]+hi(a[4]*a[0])
+
+    mul $t2,$a2,$a1    # lo(a[2..7]*a[1])  (ii)
+    @{[adcs $acc6, $acc6, $t3]}    # t[6]+hi(a[5]*a[0])
+    mul $t3,$a3,$a1
+    @{[adcs $acc7, $acc7, $t0]}    # t[7]+hi(a[6]*a[0])
+    mul $t0,$a4,$a1
+    @{[adc $acc0, $acc0, $t1]}    # t[8]+hi(a[7]*a[0])
+    mul $t1,$a5,$a1
+
+    @{[adds $acc3, $acc3, $t2]}    # t[3]+ lo(a[2]*a[1])
+    mul $t2,$a6,$a1
+    @{[adcs $acc4, $acc4, $t3]}    # t[4]+ lo(a[3]*a[1])
+    mul $t3,$a7,$a1
+    @{[adcs $acc5, $acc5, $t0]}    # t[5]+ lo(a[4]*a[1])
+    mulhu $t0,$a2,$a1    # hi(a[2..7]*a[1])
+    @{[adcs $acc6,$acc6, $t1]}    # t[6]+ lo(a[5]*a[1])
+    mulhu $t1,$a3,$a1
+    @{[adcs $acc7, $acc7, $t2]}    # t[7]+ lo(a[6]*a[1])
+    mulhu $t2,$a4,$a1
+    @{[adcs $acc0, $acc0, $t3]}    # t[8]+ lo(a[7]*a[1])
+    mulhu $t3,$a5,$a1
+    sd $acc2,0($tp)    # offload acc2, store t[2]
+    add $acc1,zero,$carry1    # t[9]=0+c_pre{t[8]+ lo(a[7]*a[1])}
+    sd $acc3,8($tp)    # offload acc3, store t[3]
+    addi $tp,$tp,16    # tp=tp+16,advance buffer pointer by 16 bytes
+
+    @{[adds $acc4, $acc4, $t0]}    # t[4]+ hi(a[2]*a[1])
+    mulhu $t0,$a6,$a1
+    @{[adcs $acc5, $acc5, $t1]}    # t[5]+ hi(a[3]*a[1])
+    mulhu $t1,$a7,$a1
+    @{[adcs $acc6, $acc6, $t2]}    # t[6]+ hi(a[4]*a[1])
+    mul $t2,$a3,$a2    # lo(a[3..7]*a[2])  (iii)
+    @{[adcs $acc7, $acc7, $t3]}    # t[7]+ hi(a[5]*a[1])
+    mul $t3,$a4,$a2
+    @{[adcs $acc0, $acc0, $t0]}    # t[8]+ hi(a[6]*a[1])
+    mul $t0,$a5,$a2
+    @{[adc  $acc1, $acc1, $t1]}    # t[9]+ hi(a[7]*a[1])
+    mul $t1,$a6,$a2
+    @{[adds $acc5, $acc5, $t2]}    # t[5]+ lo(a[3]*a[2])
+    mul $t2,$a7,$a2
+    @{[adcs $acc6, $acc6, $t3]}    # t[6]+ lo(a[4]*a[2])
+    mulhu $t3,$a3,$a2    # hi(a[3..7]*a[2])
+    @{[adcs $acc7, $acc7, $t0]}    # t[7]+ lo(a[5]*a[2])
+    mulhu $t0,$a4,$a2
+    @{[adcs $acc0, $acc0, $t1]}    # t[8]+ lo(a[6]*a[2])
+    mulhu $t1,$a5,$a2
+    @{[adcs $acc1, $acc1, $t2]}    # t[9]+ lo(a[7]*a[2])
+    mulhu $t2,$a6,$a2
+    sd $acc4,0($tp)    # offload acc4, store t[4]
+    add $acc2,zero,$carry1    # t[10]=0+c_pre{t[9]+ lo(a[7]*a[2])}
+    sd $acc5,8($tp)    # offload acc5, store t[5]
+    addi $tp,$tp,16    # tp=tp+16,advance buffer pointer by 16 bytes
+
+    @{[adds $acc6, $acc6, $t3]}    # t[6]+ hi(a[3]*a[2])
+    mulhu $t3,$a7,$a2
+    @{[adcs $acc7, $acc7, $t0]}    # t[7]+ hi(a[4]*a[2])
+    mul $t0,$a4,$a3  # lo(a[4..7]*a[3])  (iv)
+    @{[adcs $acc0, $acc0, $t1]}    # t[8]+ hi(a[5]*a[2])
+    mul $t1,$a5,$a3
+    @{[adcs $acc1, $acc1, $t2]}    # t[9]+ hi(a[6]*a[2])
+    mul $t2,$a6,$a3
+    @{[adc  $acc2, $acc2, $t3]}    # t[10]+ hi(a[7]*a[2])
+    mul $t3,$a7,$a3
+
+    @{[adds $acc7, $acc7, $t0]}    # t[7]+ lo(a[4]*a[3])
+    mulhu $t0,$a4,$a3   # hi(a[4..7]*a[3])
+    @{[adcs $acc0, $acc0, $t1]}    # t[8]+ lo(a[5]*a[3])
+    mulhu $t1,$a5,$a3
+    @{[adcs $acc1, $acc1, $t2]}    # t[9]+ lo(a[6]*a[3])
+    mulhu $t2,$a6,$a3
+    @{[adcs $acc2, $acc2, $t3]}    # t[10]+ lo(a[7]*a[3])
+    mulhu $t3,$a7,$a3
+    sd $acc6,0($tp)    # offload acc6, store t[6]
+    add $acc3,zero,$carry1    # t[11]=0+c_pre{t[10]+ lo(a[7]*a[3])}
+    sd $acc7,8($tp)    # offload acc7, store t[7]
+    addi $tp,$tp,16    # tp=tp+16, advance buffer pointer by 16 bytes
+
+    @{[adds $acc0, $acc0, $t0]}    # t[8]+ hi(a[4]*a[3])
+    mul $t0,$a5,$a4    # lo(a[5..7]*a[4])  (v)
+    @{[adcs $acc1, $acc1, $t1]}    # t[9]+ hi(a[5]*a[3])
+    mul $t1,$a6,$a4
+    @{[adcs $acc2, $acc2, $t2]}    # t[10]+ hi(a[6]*a[3])
+    mul $t2,$a7,$a4
+    @{[adc  $acc3, $acc3, $t3]}    # t[11]+ hi(a[7]*a[3])
+
+    mulhu $t3,$a5,$a4    # hi(a[5..7]*a[4])
+    @{[adds $acc1, $acc1, $t0]}    # t[9]+ lo(a[5]*a[4])
+    mulhu $t0,$a6,$a4
+    @{[adcs $acc2, $acc2, $t1]}    # t[10]+ lo(a[6]*a[4])
+    mulhu $t1,$a7,$a4
+    @{[adcs $acc3, $acc3, $t2]}    # t[11]+ lo(a[7]*a[4])
+    mul $t2,$a6,$a5    # lo(a[6..7]*a[5])  (vi)
+    add $acc4,zero,$carry1    # t[12]=0+c_pre{t[11]+ lo(a[7]*a[4])}
+
+    @{[adds $acc2, $acc2, $t3]}    # t[10]+ hi(a[5]*a[4])
+    mul $t3,$a7,$a5
+    @{[adcs $acc3, $acc3, $t0]}    # t[11]+ hi(a[6]*a[4])
+    mulhu $t0,$a6,$a5    # hi(a[6..7]*a[5])
+    @{[adc $acc4, $acc4, $t1]}    # t[12]+ hi(a[7]*a[4])
+    mulhu $t1,$a7,$a5
+
+    @{[adds $acc3, $acc3, $t2]}    # t[11]+ lo(a[6]*a[5])
+    mul $t2,$a7,$a6    # lo(a[7]*a[6])  (vii)
+    @{[adcs $acc4,$acc4, $t3]}    # t[12]+ lo(a[7]*a[5])
+    mulhu $t3,$a7,$a6    # hi(a[7]*a[6])
+    add $acc5,zero,$carry1    # t[13] =0+c_pre{t[12]+lo(a[7]*a[5])}
+
+    @{[adds $acc4, $acc4, $t0]}    # t[12]+ hi(a[6]*a[5])
+
+    @{[ld_apend $temp,$fp]} # load ap_end
+    sub $temp,$temp,$ap    # cnt = ap_end-ap, done yet?
+    @{[sd_cnt $temp,$fp]}
+
+    @{[adc $acc5, $acc5, $t1]}    # t[13]+ hi(a[7]*a[5])
+
+    @{[adds $acc5, $acc5, $t2]}     # t[13]+ lo(a[7]*a[6])
+    @{[ld_num $temp,$fp]}
+    @{[ld_apend $carry2,$fp]}
+    sub $t0, $carry2, $temp    # rewinded ap
+    add $acc6,zero,$carry1    # t[14]=0+c_pre{t[13]+ lo(a[7]*a[6])}
+    add $acc6,$acc6,$t3    # t[14] + hi(a[7]*a[6])
+
+    # Check if we have processed all elements of a:
+    # If no remaining elements, jump to next step Lsqr8x_outer_break
+    # Otherwise, load previous partial product from temp buffer,
+    # add new a product, and continue inner loop .Lsqr8x_mul.
+    @{[ld_cnt $temp,$fp]}  # load cnt, cnt = ap_end-ap
+    beqz $temp, .Lsqr8x_outer_break
+
+    mv $temp,$a0    # a0->temp,reuse register'temp'
+    # loads next batch of data from temporary buffer
+    ld $a0,0($tp)
+    ld $a1,8($tp)
+    ld $a2,16($tp)
+    ld $a3,24($tp)
+    ld $a4,32($tp)
+    ld $a5,40($tp)
+    ld $a6,48($tp)
+    ld $a7,56($tp)
+
+    # accumulate new data
+    @{[adds $acc0, $acc0, $a0]}    # t[8]~t[14]...
+    ld $a0,0($ap)    # load new data, a[8..15]...
+    @{[adcs $acc1, $acc1, $a1]}
+    ld $a1,8($ap)
+    @{[adcs $acc2, $acc2, $a2]}
+    ld $a2,16($ap)
+    @{[adcs $acc3, $acc3, $a3]}
+    ld $a3,24($ap)
+    @{[adcs $acc4, $acc4, $a4]}
+    ld $a4,32($ap)
+    @{[adcs $acc5, $acc5, $a5]}
+    ld $a5,40($ap)
+    @{[adcs $acc6, $acc6, $a6]}
+    ld $a6,48($ap)
+
+    mv $np,$ap    # ap->np,reuse register'np',a[8]
+    @{[adcs $acc7, "zero", $a7]}    # t[7]...
+    ld $a7,56($ap)
+    add $ap,$ap,64    # move pointer forward by 64 bytes (8 elements), a[16]
+
+    li $carry2,-64    # cnt=-64, set loop count
+    @{[sd_cnt $carry2,$fp]}
+
+    # Compute cross products: (current 8 elements) x (remaining a)
+    # Accumulate results in RAX (accumulator)
+    #
+    #                                                         a[8]a[0]
+    #                                                     a[9]a[0]
+    #                                                 a[a]a[0]
+    #                                             a[b]a[0]
+    #                                         a[c]a[0]
+    #                                     a[d]a[0]
+    #                                 a[e]a[0]
+    #                             a[f]a[0]
+    #                                                     a[8]a[1]
+    #                         a[f]a[1]........................
+    #                                                 a[8]a[2]
+    #                     a[f]a[2]........................
+    #                                             a[8]a[3]
+    #                 a[f]a[3]........................
+    #                                         a[8]a[4]
+    #             a[f]a[4]........................
+    #                                     a[8]a[5]
+    #         a[f]a[5]........................
+    #                                 a[8]a[6]
+    #     a[f]a[6]........................
+    #                             a[8]a[7]
+    # a[f]a[7]........................
+ .Lsqr8x_mul:
+    mul $t0,$a0,$temp    # lo(a[i+8..i+15]*a[i]),..,lo(a[i+8..i+15]*a[i+7])
+    add $carry,zero,$carry1    # carry bit
+    mul $t1,$a1,$temp
+
+    @{[ld_cnt $carry2,$fp]}
+    add $carry2,$carry2,8    # cnt=cnt+8
+    @{[sd_cnt $carry2,$fp]}
+
+    mul $t2,$a2,$temp
+    mul $t3,$a3,$temp
+    @{[adds $acc0,$acc0,$t0]}
+    mul $t0,$a4,$temp
+    @{[adcs $acc1,$acc1,$t1]}
+    mul $t1,$a5,$temp
+    @{[adcs $acc2,$acc2,$t2]}
+    mul $t2,$a6,$temp
+    @{[adcs $acc3, $acc3, $t3]}
+    mul $t3,$a7,$temp
+    @{[adcs $acc4, $acc4, $t0]}
+    mulhu $t0,$a0,$temp    # hi(a[i+8..i+15]*a[i]),..,hi(a[i+8..i+15]*a[i+7])
+    @{[adcs $acc5, $acc5, $t1]}
+    mulhu $t1,$a1,$temp
+    @{[adcs $acc6, $acc6, $t2]}
+    mulhu $t2,$a2,$temp
+    @{[adcs $acc7, $acc7, $t3]}
+    mulhu $t3,$a3,$temp
+    add $carry,$carry,$carry1
+    sd $acc0,0($tp)
+    addi $tp,$tp,8    # move pointer forward by 8 bytes (1 element)
+
+    @{[adds $acc0, $acc1, $t0]}
+    mulhu $t0,$a4,$temp
+    @{[adcs $acc1, $acc2, $t1]}
+    mulhu $t1,$a5,$temp
+    @{[adcs $acc2, $acc3, $t2]}
+    mulhu $t2,$a6,$temp
+    @{[adcs $acc3, $acc4, $t3]}
+    mulhu $t3,$a7,$temp
+
+    @{[ld_cnt $carry2,$fp]}
+    add $carry2,$np,$carry2    # carry2 = np + cnt
+    ld $temp,0($carry2)    # a[i+1],..,a[i+7],i=0
+    @{[adcs $acc4, $acc5, $t0]}
+    @{[adcs $acc5, $acc6, $t1]}
+    @{[adcs $acc6, $acc7, $t2]}
+    @{[adcs $acc7, $carry, $t3]}
+
+    # Process current 8 elements:
+    #  - If not finished: continue inner loop, .Lsqr8x_mul
+    #  - If finished: check remaining a elements
+    #     - If none: break out,jump to .Lsqr8x_break
+    #     - Else: load new data and restart inner loop, .Lsqr8x_mul
+    @{[ld_cnt $carry2,$fp]}
+    bnez $carry2,.Lsqr8x_mul
+
+    @{[ld_apend $carry2,$fp]}
+    beq $ap, $carry2, .Lsqr8x_break    # done yet?
+
+    # loads next batch of data from temporary buffer
+    ld $a0,0($tp)
+    ld $a1,8($tp)
+    ld $a2,16($tp)
+    ld $a3,24($tp)
+    ld $a4,32($tp)
+    ld $a5,40($tp)
+    ld $a6,48($tp)
+    ld $a7,56($tp)
+
+    # accumulate new data
+    # load new data, a[16..23]...
+    @{[adds $acc0, $acc0, $a0]}
+    ld $a0,0($ap)
+    ld $temp, -64($np)    # return to start of current 8 elements, a[0], a[8]...
+    @{[adcs $acc1, $acc1, $a1]}
+    ld $a1,8($ap)
+    @{[adcs $acc2, $acc2, $a2]}
+    ld $a2,16($ap)
+    @{[adcs $acc3, $acc3, $a3]}
+    ld $a3,24($ap)
+    @{[adcs $acc4, $acc4, $a4]}
+    ld $a4,32($ap)
+    @{[adcs $acc5, $acc5, $a5]}
+    ld $a5,40($ap)
+    @{[adcs $acc6, $acc6, $a6]}
+    ld $a6,48($ap)
+    li $carry2,-64    # set loop count, cnt=-64, 8 times
+    @{[sd_cnt $carry2,$fp]}
+    @{[adcs $acc7, $acc7, $a7]}
+    ld $a7,56($ap)
+    add $ap,$ap,64    # move pointer forward by 64 bytes, a[24]...
+    j .Lsqr8x_mul
+
+    # Outer iteration completion:
+    #    - Fetch next data chunk for upcoming iteration
+    #    - Determine if this is the final iteration
+    #    - Update pointer positions and buffer offsets
+.balign 16
+.Lsqr8x_break:
+    ld $a0,0($np)
+    ld $a1,8($np)
+    add $ap,$np,64
+    ld $a2,16($np)
+    ld $a3,24($np)
+    @{[ld_apend $carry2,$fp]}
+    sub $t0,$carry2,$ap    # is it last iteration?
+    ld $a4,32($np)
+    ld $a5,40($np)
+    sub $t1,$tp,$t0
+    ld $a6,48($np)
+    ld $a7,56($np)
+    beqz $t0, .Lsqr8x_outer_loop
+
+    sd $acc0,0($tp)
+    sd $acc1,8($tp)
+    ld $acc0,0($t1)
+    ld $acc1,8($t1)
+    sd $acc2,16($tp)
+    sd $acc3,24($tp)
+    ld $acc2,16($t1)
+    ld $acc3,24($t1)
+    sd $acc4,32($tp)
+    sd $acc5,40($tp)
+    ld $acc4,32($t1)
+    ld $acc5,40($t1)
+    sd $acc6,48($tp)
+    sd $acc7,56($tp)
+    mv $tp,$t1
+    ld $acc6,48($t1)
+    ld $acc7,56($t1)
+
+    j .Lsqr8x_outer_loop
+    # Squaring algorithm:
+    # 1.Reload input
+    # 2.Compute diagonal squares, a[n-1]*a[n-1]|...|a[0]*a[0]
+    # 3.Cross terms x 2: first via lsl#1, rest via extr chain (128-bit << 1)
+    # 4.Merge diagonal + cross terms x 2
+.balign 16
+.Lsqr8x_outer_break:
+    ld $a1,0($t0)    # recall that $t0 is &a[0], load a[0 1 2 3]
+    ld $a3,8($t0)
+    ld $t1,8(sp)    # load previously computed cross product term from buffer
+    ld $t2,16(sp)
+    ld $a5,16($t0)
+    ld $a7,24($t0)
+
+    add $ap,$t0,32    # a[4]
+    ld $t3,24(sp)
+    sd $acc0,0($tp)
+    ld $t0,32(sp)
+    sd $acc1,8($tp)
+
+    mul $acc0,$a1,$a1    # lo(a[0]*a[0])
+    sd $acc2,16($tp)
+    sd $acc3,24($tp)
+    mulhu $a1,$a1,$a1    # hi(a[0]*a[0])
+    sd $acc4,32($tp)
+    sd $acc5,40($tp)
+    mul $a2,$a3,$a3    # lo(a[1]*a[1])
+    sd $acc6,48($tp)
+    sd $acc7,56($tp)
+    mv $tp,sp
+    mulhu $a3,$a3,$a3    # hi(a[1]*a[1])
+
+    slli $carry2,$t1,1    # cross terms x 2
+    @{[adds $acc1, $a1, $carry2]}    # (t1 << 1) + hi(a[0]*a[0])
+    @{[extr $t1, $t2, $t1]}    # t1 = (t2 << 1) + high(t1), cross term multiplied by 2
+
+    @{[ld_num $carry2,$fp]}
+    addi $carry2,$carry2,-32    # set loop count, cnt=num-32
+    @{[sd_cnt $carry2,$fp]}
+
+    # Loop processing 4 diagonal elements per iteration:
+    #  - Compute a[i]*a[i] for 4 elements
+    #  - Accumulate left-shifted cross product terms
+.Lsqr4x_shift_n_add:
+    @{[adcs $acc2, $a2, $t1]}    # t1+ lo(a[1]*a[1])
+    @{[extr $t2, $t3, $t2]}    # t2 = (t3 << 1) + high(t2), cross term multiplied by 2
+
+    @{[ld_cnt $carry2,$fp]}
+    addi $carry2,$carry2,-32    # cnt=cnt-32
+    @{[sd_cnt $carry2,$fp]}
+
+    @{[adcs $acc3, $a3, $t2]}    # t2+ hi(a[1]*a[1])
+    ld $t1,40($tp)
+    ld $t2,48($tp)
+    mul $a4,$a5,$a5    # lo(a[2]*a[2])
+    ld $a1,0($ap)    # a[4 5]
+    ld $a3,8($ap)
+    addi $ap,$ap,16
+    mulhu $a5,$a5,$a5    # hi(a[2]*a[2])
+    @{[extr $t3, $t0, $t3]}    # t3 = (t0 << 1) + high(t3), cross term multiplied by 2
+    sd $acc0,0($tp)
+    sd $acc1,8($tp)
+    mul $a6,$a7,$a7    # lo(a[3]*a[3])
+    mulhu $a7,$a7,$a7    # hi(a[3]*a[3])
+    @{[adcs $acc4, $a4, $t3]}    # t3 + lo(a[2]*a[2])
+    @{[extr $t0, $t1, $t0]}    # t0 = (t1 << 1) + high(t0), cross term multiplied by 2
+    sd $acc2,16($tp)
+    sd $acc3,24($tp)
+    @{[adcs $acc5, $a5, $t0]}    # t0 + hi(a[2]*a[2])
+    ld $t3,56($tp)
+    ld $t0,64($tp)
+    @{[extr $t1, $t2, $t1]}    # t1 = (t2 << 1) + high(t1), cross term multiplied by 2
+    @{[adcs $acc6, $a6, $t1]}    # t1 + lo(a[3]*a[3])
+    @{[extr $t2, $t3, $t2]}    # t2 = (t3 << 1) + high(t2), cross term multiplied by 2
+    @{[adcs $acc7, $a7, $t2]}    # t2 + hi(a[3]*a[3])
+    ld $t1,72($tp)
+    ld $t2,80($tp)
+    mul $a0,$a1,$a1    # lo(a[4]*a[4])
+    ld $a5,0($ap)    # a[6 7]
+    ld $a7,8($ap)
+    addi $ap,$ap,16    # move forward by two elements
+    mulhu $a1,$a1,$a1    # hi(a[4]*a[4])
+    sd $acc4,32($tp)
+    sd $acc5,40($tp)
+    mul $a2,$a3,$a3    # lo(a[5]*a[5])
+    mulhu $a3,$a3,$a3    # hi(a[5]*a[5])
+    @{[extr $t3, $t0, $t3]}    # t3 = (t0 << 1) + high(t3), cross term multiplied by 2
+    sd $acc6,48($tp)
+    sd $acc7,56($tp)
+    addi $tp,$tp,64
+    @{[adcs $acc0, $a0, $t3]}    # t3 + lo(a[4]*a[4])
+    @{[extr $t0, $t1, $t0]}    # t0 = (t1 << 1) + high(t0), cross term multiplied by 2
+    @{[adcs $acc1, $a1, $t0]}    # t0 + hi(a[4]*a[4])
+    ld $t3,24($tp)
+    ld $t0,32($tp)
+    @{[extr $t1, $t2, $t1]}    # t1 = (t2 << 1) + high(t1), cross term multiplied by 2
+
+    @{[ld_cnt $carry2,$fp]}
+    bnez $carry2,.Lsqr4x_shift_n_add    # if cnt!=0, jump to .Lsqr4x_shift_n_add
+___
+my ($np,$np_temp) = use_regs(11,13);
+$code.=<<___;
+    # Tail element handling for square computation
+    @{[ld_np $np,$fp]}    # load np, N
+    @{[ld_n0 $n0,$fp]} # load n0, n0`
+    @{[adcs $acc2, $a2, $t1]}    # t1 + lo(a[n-3]*a[n-3])
+    @{[extr $t2, $t3, $t2]}    # t2 = (t3 << 1) + high(t2), cross term multiplied by 2
+    @{[adcs $acc3, $a3, $t2]}    # t2 + hi(a[n-3]*a[n-3])
+    ld $t1,40($tp)
+    ld $t2,48($tp)
+    mul $a4,$a5,$a5    # lo(a[n-2]*a[n-2])
+    mulhu $a5,$a5,$a5    # hi(a[n-2]*a[n-2])
+    sd $acc0,0($tp)
+    sd $acc1,8($tp)
+    mul $a6,$a7,$a7    # lo(a[n-1]*a[n-1])
+    mulhu $a7,$a7,$a7    # hi(a[n-1]*a[n-1])
+    sd $acc2,16($tp)
+    sd $acc3,24($tp)
+    @{[extr $t3, $t0, $t3]}    # t3 = (t0 << 1) + high(t3), cross term multiplied by 2
+    @{[adcs $acc4, $a4, $t3]}    # t3 + lo(a[n-2]*a[n-2])
+    @{[extr $t0, $t1, $t0]}    # t0 = (t1 << 1) + high(t0), cross term multiplied by 2
+    ld $acc0,0(sp)
+    ld $acc1,8(sp)
+    @{[adcs $acc5, $a5, $t0]}    # t0 + hi(a[n-2]*a[n-2])
+    @{[extr $t1, $t2, $t1]}    # t1 = (t2 << 1) + high(t1), cross term multiplied by 2
+    ld $a0,0($np)    # load N[0..7]
+    ld $a1,8($np)
+    @{[adcs $acc6, $a6, $t1]}    # t1 + lo(a[n-1]*a[n-1])
+    @{[extr $t2, "zero", $t2]}    # t2 = high(t2)
+    ld $a2,16($np)
+    ld $a3,24($np)
+    @{[adc $acc7, $a7, $t2]}     # t2 + hi(a[n-1]*a[n-1])
+    ld $a4,32($np)
+    ld $a5,40($np)
+
+    # Reduce by 512 bits per iteration
+    mul $na0,$n0,$acc0    # t[0]*n0, the modular reduction coefficient
+    ld $a6,48($np)
+    ld $a7,56($np)
+    @{[ld_num $carry2,$fp]}
+    add $carry2,$np,$carry2    # np_end=np+num
+    @{[sd_npend $carry2,$fp]}
+    ld $acc2,16(sp)
+    ld $acc3,24(sp)
+    sd $acc4,32($tp)
+    sd $acc5,40($tp)
+    ld $acc4,32(sp)
+    ld $acc5,40(sp)
+    sd $acc6,48($tp)
+    sd $acc7,56($tp)
+    ld $acc6,48(sp)
+    ld $acc7,56(sp)
+    add $np,$np,64    # move pointer forward by 64 bytes, 8 elements
+    mv $tp,sp
+    li $carry2,0
+    @{[sd_topmost $carry2,$fp]} # initial topmost carry as 0
+    li $carry2,8    # set loop count, cnt=8
+    @{[sd_cnt $carry2,$fp]}
+
+    # Montgomery reduction, t = t + m * N / 2^64
+    # m = t[i] * n0 mod 2^64
+    # single-round reduction flow: Lsqr8x_reduction --> Lsqr8x_tail -> Lsqr8x_tail_break
+    #                                               |__ Lsqr8x8_post_condition (num == 8 case)
+    # after one round of reduction completes,
+    # process the current m[i..i+7]*N+t until
+    # all tp in the buffer have been reduced
+ .Lsqr8x_reduction:
+    # mul $t0,$a0,$na0    # low word elided: lo(n[0]*na0) cancels t[0] by choice of na0; only its carry matters (snez below)
+    mul $t1,$a1,$na0    # lo(n[1-7])*lo(t[0]*n0)
+
+    @{[ld_cnt $carry2,$fp]}
+    addi $carry2,$carry2,-1    # cnt=cnt-1
+    @{[sd_cnt $carry2,$fp]}
+
+    mul $t2,$a2,$na0
+    sd $na0,0($tp)    # put aside "na0=t[0]*n0" for tail processing
+    addi $tp,$tp,8    # tp=tp+8
+    mul $t3,$a3,$na0
+
+    # low64 partial product + reduction term
+    snez $carry1, $acc0        # only carry status needed, not full lo1 result
+    mul $t0,$a4,$na0
+    @{[adcs $acc0, $acc1, $t1]}
+    mul $t1,$a5,$na0
+    @{[adcs $acc1, $acc2, $t2]}
+    mul $t2,$a6,$na0
+    @{[adcs $acc2, $acc3, $t3]}
+    mul $t3,$a7,$na0
+    @{[adcs $acc3, $acc4, $t0]}
+    mulhu $t0,$a0,$na0    # hi(n[0-7])*lo(t[0]*n0)
+    @{[adcs $acc4, $acc5, $t1]}
+    mulhu $t1,$a1,$na0
+    @{[adcs $acc5, $acc6, $t2]}
+    mulhu $t2,$a2,$na0
+    @{[adcs $acc6, $acc7, $t3]}
+    mulhu $t3,$a3,$na0
+    add $acc7,zero,$carry1
+
+    # high64 partial product + reduction term
+    @{[adds $acc0, $acc0, $t0]}
+    mulhu $t0,$a4,$na0
+    @{[adcs $acc1, $acc1, $t1]}
+    mulhu $t1,$a5,$na0
+    @{[adcs $acc2, $acc2, $t2]}
+    mulhu $t2,$a6,$na0
+    @{[adcs $acc3, $acc3, $t3]}
+    mulhu $t3,$a7,$na0
+    mul $na0,$n0,$acc0    # next na0
+    @{[adcs $acc4, $acc4, $t0]}
+    @{[adcs $acc5, $acc5, $t1]}
+    @{[adcs $acc6, $acc6, $t2]}
+    @{[adc  $acc7, $acc7, $t3]}
+
+    @{[ld_cnt $carry2,$fp]}
+    bnez $carry2,.Lsqr8x_reduction    # 8 iteration done?
+
+    ld $t0,0($tp)
+    ld $t1,8($tp)
+    ld $t2,16($tp)
+    ld $t3,24($tp)
+    mv $np_temp,$tp
+    @{[ld_npend $carry2,$fp]}
+    sub $carry2,$carry2,$np    # cnt=np_end-np, done yet?
+    @{[sd_cnt $carry2,$fp]}
+
+    @{[adds $acc0, $acc0, $t0]}
+    @{[adcs $acc1, $acc1, $t1]}
+    ld $t0,32($tp)
+    ld $t1,40($tp)
+    @{[adcs $acc2, $acc2, $t2]}
+    @{[adcs $acc3, $acc3, $t3]}
+    ld $t2,48($tp)
+    ld $t3,56($tp)
+    @{[adcs $acc4, $acc4, $t0]}
+    @{[adcs $acc5, $acc5, $t1]}
+    @{[adcs $acc6, $acc6, $t2]}
+    @{[adcs $acc7, $acc7, $t3]}
+
+    # check whether N has finished iteration.
+    # If completed, proceed to the special scenario '8' for processing;
+    # otherwise, continue with the reduction Lsqr8x_tail.
+    @{[ld_cnt $carry2,$fp]}
+    beqz $carry2,.Lsqr8x8_post_condition
+
+    ld $n0,-64($tp)    # load the previous na0
+    ld $a0,0($np)    # load next N[0..7]
+    ld $a1,8($np)
+    ld $a2,16($np)
+    ld $a3,24($np)
+    ld $a4,32($np)
+    ld $a5,40($np)
+    li $carry2,-64
+    @{[sd_cnt $carry2,$fp]}  # set loop cnt,cnt=-64
+    ld $a6,48($np)
+    ld $a7,56($np)
+    add $np,$np,64    # move pointer forward by next 8 elements
+
+.Lsqr8x_tail:
+    mul $t0,$a0,$n0
+    add $carry,zero,$carry1    # carry bit, modulo-scheduled
+    mul $t1,$a1,$n0
+
+    @{[ld_cnt $carry2,$fp]}
+    addi $carry2,$carry2,8    # cnt=cnt+8
+    @{[sd_cnt $carry2,$fp]}
+
+    mul $t2,$a2,$n0
+    mul $t3,$a3,$n0
+    # low64 partial product + reduction term
+    @{[adds $acc0, $acc0, $t0]}
+    mul $t0,$a4,$n0
+    @{[adcs $acc1, $acc1, $t1]}
+    mul $t1,$a5,$n0
+    @{[adcs $acc2, $acc2, $t2]}
+    mul $t2,$a6,$n0
+    @{[adcs $acc3, $acc3, $t3]}
+    mul $t3,$a7,$n0
+    @{[adcs $acc4, $acc4, $t0]}
+    mulhu $t0,$a0,$n0
+    @{[adcs $acc5, $acc5, $t1]}
+    mulhu $t1,$a1,$n0
+    @{[adcs $acc6, $acc6, $t2]}
+    mulhu $t2,$a2,$n0
+    @{[adcs $acc7, $acc7, $t3]}
+    mulhu $t3,$a3,$n0
+    add $carry,$carry,$carry1
+    sd $acc0,0($tp)
+    addi $tp,$tp,8    # move pointer forward by next 8 elements
+
+    # high64 partial product + reduction term
+    @{[adds $acc0, $acc1, $t0]}
+    mulhu $t0,$a4,$n0
+    @{[adcs $acc1, $acc2, $t1]}
+    mulhu $t1,$a5,$n0
+    @{[adcs $acc2, $acc3, $t2]}
+    mulhu $t2,$a6,$n0
+    @{[adcs $acc3, $acc4, $t3]}
+    mulhu $t3,$a7,$n0
+
+    @{[ld_cnt $carry2,$fp]}
+    add $carry2,$np_temp,$carry2
+    ld $n0,0($carry2)    # next n0, ld 0(np_temp+cnt)
+
+    @{[adcs $acc4, $acc5, $t0]}
+    @{[adcs $acc5, $acc6, $t1]}
+    @{[adcs $acc6, $acc7, $t2]}
+    @{[adcs $acc7, $carry, $t3]}
+    @{[ld_cnt $carry2,$fp]}
+    bnez $carry2,.Lsqr8x_tail    # 8 iteration done?
+
+    ld $a0,0($tp)
+    ld $a1,8($tp)
+    @{[ld_npend $carry2,$fp]}
+    sub $carry2,$carry2,$np    # done yet? cnt=np_end-np
+    @{[sd_cnt $carry2,$fp]}
+
+    @{[ld_npend $t0,$fp]}
+    @{[ld_num $t1,$fp]}
+    sub $t2, $t0, $t1    # rewound np, t2=np_end-num
+    ld $a2,16($tp)
+    ld $a3,24($tp)
+    ld $a4,32($tp)
+    ld $a5,40($tp)
+    ld $a6,48($tp)
+    ld $a7,56($tp)
+
+    @{[ld_cnt $carry2,$fp]}
+    beqz $carry2,.Lsqr8x_tail_break    # exit loop when np=np_end
+
+    ld $n0,-64($np_temp)    # load the previous n0 value
+    @{[adds $acc0, $acc0, $a0]}
+    @{[adcs $acc1, $acc1, $a1]}
+    ld $a0,0($np)    # next N[0..7]
+    ld $a1,8($np)
+    @{[adcs $acc2, $acc2, $a2]}
+    @{[adcs $acc3, $acc3, $a3]}
+    ld $a2,16($np)
+    ld $a3,24($np)
+    @{[adcs $acc4, $acc4, $a4]}
+    @{[adcs $acc5, $acc5, $a5]}
+    ld $a4,32($np)
+    ld $a5,40($np)
+    @{[adcs $acc6, $acc6, $a6]}
+
+    li $carry2,-64    # set loop cnt,cnt=-64
+    @{[sd_cnt $carry2,$fp]}
+
+    @{[adcs $acc7, $acc7, $a7]}
+    ld $a6,48($np)
+    ld $a7,56($np)
+    add $np,$np,64
+    j .Lsqr8x_tail
+
+.balign 16
+.Lsqr8x_tail_break:
+    @{[ld_n0 $n0,$fp]}    # load n0
+    addi $carry2,$tp,64    # cnt=tp+64 (later compared against fp to detect the final round)
+    @{[sd_cnt $carry2,$fp]}
+    @{[ld_topmost $carry2,$fp]} # load topmost
+
+    snez $carry1, $carry2    # "move" topmost carry to carry bit
+    @{[adcs $t0, $acc0, $a0]}
+    @{[adcs $t1, $acc1, $a1]}
+    ld $acc0,0($np_temp)    # load the current t[0..7] from tp
+    ld $acc1,8($np_temp)
+    @{[adcs $acc2, $acc2, $a2]}
+    ld $a0,0($t2)    # recall that $t2 is &n[0], initialize, load N[0..7]
+    ld $a1,8($t2)
+    @{[adcs $acc3, $acc3, $a3]}
+    ld $a2,16($t2)
+    ld $a3,24($t2)
+    @{[adcs $acc4, $acc4, $a4]}
+    @{[adcs $acc5, $acc5, $a5]}
+    ld $a4,32($t2)
+    ld $a5,40($t2)
+    @{[adcs $acc6, $acc6, $a6]}
+    @{[adcs $acc7, $acc7, $a7]}
+    ld $a6,48($t2)
+    ld $a7,56($t2)
+    addi $np,$t2,64    # np=np+64, move pointer forward by next 8 elements
+
+    @{[ld_topmost $carry2,$fp]}    # NOTE(review): loaded value is overwritten by the next instruction — load appears redundant; confirm
+    add $carry2,zero,$carry1    # topmost carry
+    @{[sd_topmost $carry2,$fp]} # offload topmost
+
+    mul $na0,$n0,$acc0    # na0=n0*t[0]
+    sd $t0,0($tp)
+    sd $t1,8($tp)
+    sd $acc2,16($tp)
+    sd $acc3,24($tp)
+    ld $acc2,16($np_temp)
+    ld $acc3,24($np_temp)
+    sd $acc4,32($tp)
+    sd $acc5,40($tp)
+    ld $acc4,32($np_temp)
+    ld $acc5,40($np_temp)
+    @{[ld_cnt $t1,$fp]}    # load cnt, t1<-cnt
+    sd $acc6,48($tp)
+    sd $acc7,56($tp)
+    mv $tp,$np_temp    # update tp<-np_temp
+    ld $acc6,48($np_temp)
+    ld $acc7,56($np_temp)
+    li $carry2,8    #  set loop cnt, cnt=8
+    @{[sd_cnt $carry2,$fp]}
+    bne $t1,$fp, .Lsqr8x_reduction    # if t1 != fp, jump to the next round of reduction and
+                                      # continue with the reduction of the next set of na0.
+
+    @{[sd_n0 $n0,$fp]}    # offload n0
+    @{[ld_rp $carry,$fp]}
+    add $tp,$tp,64
+    @{[subs  $t0, $acc0, $a0]}
+    @{[sbcs  $t1, $acc1, $a1]}
+    @{[ld_num $t2,$fp]}
+    addi $t3,$t2,-64    # cnt=num-64
+    @{[sd_cnt $t3,$fp]}
+    mv $carry2,$carry    # ap_end=rp, rp copy
+    @{[sd_apend $carry2,$fp]}
+
+    # conditional subtraction, compute T-N
+    # If T < N (borrow occurs), set t = T;
+    # If T >= N (no borrow), set t = T-N
+    # Process 8 elements per loop iteration.
+.Lsqr8x_sub:
+    @{[sbcs  $t2, $acc2, $a2]}    # t=t-N
+    ld $a0,0($np)
+    ld $a1,8($np)
+    @{[sbcs  $t3, $acc3, $a3]}
+    sd $t0,0($carry)    # sd result
+    sd $t1,8($carry)
+    @{[sbcs  $t0, $acc4, $a4]}
+    ld $a2,16($np)
+    ld $a3,24($np)
+    @{[sbcs  $t1, $acc5, $a5]}
+    sd $t2,16($carry)
+    sd $t3,24($carry)
+    @{[sbcs  $t2, $acc6, $a6]}
+    ld $a4,32($np)
+    ld $a5,40($np)
+    @{[sbcs  $t3, $acc7, $a7]}
+    ld $a6,48($np)
+    ld $a7,56($np)
+    add $np,$np,64
+    ld $acc0,0($tp)    # ld t[0..7]
+    ld $acc1,8($tp)
+
+    @{[ld_cnt $carry2,$fp]}
+    addi $carry2,$carry2,-64    # cnt=cnt-64
+    @{[sd_cnt $carry2,$fp]}
+
+    ld $acc2,16($tp)
+    ld $acc3,24($tp)
+    ld $acc4,32($tp)
+    ld $acc5,40($tp)
+    ld $acc6,48($tp)
+    ld $acc7,56($tp)
+    add $tp,$tp,64    # move pointer forward by next 8 elements
+    sd $t0,32($carry)
+    sd $t1,40($carry)
+    @{[sbcs  $t0, $acc0, $a0]}
+    sd $t2,48($carry)
+    sd $t3,56($carry)
+    add $carry,$carry,64
+    @{[sbcs  $t1, $acc1, $a1]}
+
+    @{[ld_cnt $carry2,$fp]}
+    bnez $carry2,.Lsqr8x_sub    # # if cnt!=0, jump to Lsqr8x_sub
+
+    @{[sbcs  $t2, $acc2, $a2]}
+    mv $tp,sp
+    @{[ld_num $carry2,$fp]}
+    add $ap,sp,$carry2    # ap=sp-num
+
+    @{[ld_apend $carry2,$fp]}
+    ld $a0,0($carry2)    # ld 0(ap_end)
+    ld $a1,8($carry2)
+
+    @{[sbcs  $t3, $acc3, $a3]}
+    sd $t0,0($carry)
+    sd $t1,8($carry)
+    @{[sbcs  $t0, $acc4, $a4]}
+    @{[ld_apend $carry2,$fp]}
+    ld $a2,16($carry2)
+    ld $a3,24($carry2)
+    @{[sbcs  $t1, $acc5, $a5]}
+    sd $t2,16($carry)
+    sd $t3,24($carry)
+    @{[sbcs  $t2, $acc6, $a6]}
+    ld $acc0,0($ap)
+    ld $acc1,8($ap)
+    @{[sbcs  $t3, $acc7, $a7]}
+    ld $acc2,16($ap)
+    ld $acc3,24($ap)
+
+    @{[ld_topmost $temp,$fp]}
+    sub $carry2, $temp, $carry1    # did it borrow?
+    sltu $carry1, $temp, $carry2
+    xori $carry1, $carry1, 1
+
+    sd $t0,32($carry)
+    sd $t1,40($carry)
+    sd $t2,48($carry)
+    sd $t3,56($carry)
+
+    @{[ld_num $carry2,$fp]}
+    addi $carry2,$carry2,-32    # cnt=num-32
+    @{[sd_cnt $carry2,$fp]}
+    @{[ld_ra $ra,$fp]}    # offload ra
+
+
+    # The csel conditional instruction selects the result based on the borrow flag.
+    # If there is a borrow (C=0), select the original value acc
+    # If there is no borrow (C=1), select the subtracted value a
+    # Process 4 elements per loop iteration.
+.Lsqr4x_cond_copy:
+    @{[ld_cnt $carry2,$fp]}
+    addi $carry2,$carry2,-32    # cnt=cnt_pre-32=num-32-32
+    @{[sd_cnt $carry2,$fp]}
+
+    @{[csel  $t0, $acc0, $a0]}    # when pre_borrow occurs, t0 = acc0
+    sd zero,0($tp)
+    sd zero,8($tp)
+    @{[csel  $t1, $acc1, $a1]}
+    @{[ld_apend $carry2,$fp]}
+    ld $a0,32($carry2)
+    ld $a1,40($carry2)
+    ld $acc0,32($ap)
+    ld $acc1,40($ap)
+    @{[csel  $t2, $acc2, $a2]}
+    sd zero,16($tp)
+    sd zero,24($tp)
+    add $tp,$tp,32
+    @{[csel  $t3, $acc3, $a3]}
+    ld $a2,48($carry2)
+    ld $a3,56($carry2)
+    ld $acc2,48($ap)
+    ld $acc3,56($ap)
+    add $ap,$ap,32
+    sd $t0,0($carry2)
+    sd $t1,8($carry2)
+    sd $t2,16($carry2)
+    sd $t3,24($carry2)
+    add $carry2,$carry2,32    # ap_end=ap_end+32
+    @{[sd_apend $carry2,$fp]}
+    sd zero,0($ap)
+    sd zero,8($ap)
+    sd zero,16($ap)
+    sd zero,24($ap)
+
+    @{[ld_cnt $carry2,$fp]}
+    bnez $carry2,.Lsqr4x_cond_copy    # if cnt!=0, jump to Lsqr4x_cond_copy
+
+    @{[csel  $t0, $acc0, $a0]}
+    sd zero,0($tp)
+    sd zero,8($tp)
+    @{[csel  $t1, $acc1, $a1]}
+    sd zero,16($tp)
+    sd zero,24($tp)
+    @{[csel  $t2, $acc2, $a2]}
+    @{[csel  $t3, $acc3, $a3]}
+    @{[ld_apend $carry2,$fp]}
+    sd $t0,0($carry2)
+    sd $t1,8($carry2)
+    sd $t2,16($carry2)
+    sd $t3,24($carry2)
+    j .Lsqr8x_done
+
+# process special case for 8-word boundary
+.balign 16
+.Lsqr8x8_post_condition:
+    add $carry,zero,$carry1
+    @{[subs  $a0, $acc0, $a0]}    # acc0-7,carry hold result, a0-7 hold modulus
+    @{[ld_rp $ap,$fp]}    # pull rp
+    @{[sbcs  $a1, $acc1, $a1]}
+    sd zero,0(sp)
+    sd zero,8(sp)
+    @{[sbcs  $a2, $acc2, $a2]}
+    sd zero,16(sp)
+    sd zero,24(sp)
+    @{[sbcs  $a3, $acc3, $a3]}
+    sd zero,32(sp)
+    sd zero,40(sp)
+    @{[sbcs  $a4, $acc4, $a4]}
+    sd zero,48(sp)
+    sd zero,56(sp)
+    @{[sbcs  $a5, $acc5, $a5]}
+    sd zero,64(sp)
+    sd zero,72(sp)
+    @{[sbcs  $a6, $acc6, $a6]}
+    sd zero,80(sp)
+    sd zero,88(sp)
+    @{[sbcs  $a7, $acc7, $a7]}
+    sd zero,96(sp)
+    sd zero,104(sp)
+    @{[sbcs  $carry, $carry, "zero"]}    # did it borrow?
+    xori $carry1, $carry1, 1
+    sd zero,112(sp)
+    sd zero,120(sp)
+
+    # a0-7 hold result-modulus
+    @{[csel  $a0, $acc0, $a0]}
+    @{[csel  $a1, $acc1, $a1]}
+    @{[csel  $a2, $acc2, $a2]}
+    @{[csel  $a3, $acc3, $a3]}
+    sd $a0,0($ap)
+    sd $a1,8($ap)
+    @{[csel  $a4, $acc4, $a4]}
+    @{[csel  $a5, $acc5, $a5]}
+    sd $a2,16($ap)
+    sd $a3,24($ap)
+    @{[csel  $a6, $acc6, $a6]}
+    @{[csel  $a7, $acc7, $a7]}
+    sd $a4,32($ap)
+    sd $a5,40($ap)
+    sd $a6,48($ap)
+    sd $a7,56($ap)
+
+    @{[ld_ra $ra,$fp]}    # pull ra return address
+.Lsqr8x_done:
+    mv sp, $fp
+    li $rp, 1
+___
+$code .= <<___;
+    ld      s0,88(sp)
+    ld      s1,80(sp)
+    ld      s2,72(sp)
+    ld      s3,64(sp)
+    ld      s4,56(sp)
+    ld      s5,48(sp)
+    ld      s6,40(sp)
+    ld      s7,32(sp)
+    ld      s8,24(sp)
+    ld      s9,16(sp)
+    ld      s11,8(sp)
+    ld      s10,0(sp)
+    addi    sp,sp,168
+    ret
+.size bn_sqr8x_mont,.-bn_sqr8x_mont
+___
+}
 print $code;
 close STDOUT or die "error closing STDOUT: $!";