Commit a81b4d920b for openssl.org
commit a81b4d920b28b4958d7fdc45648b5d778cc60df0
Author: zhoulu <zhou.lu1@zte.com.cn>
Date: Sat May 16 15:08:34 2026 +0800
Further improve the decryption performance of AES-128-CBC on the RISC-V architecture
The decryption performance of AES-128-CBC is improved by 6% to 15%, with the main optimizations as follows:
1. Blocks are now processed by a single-block loop, a 4-block loop and an 8-block loop, selected by the remaining length.
2. In the 4-block and 8-block paths, the ciphertext copies needed for the CBC XOR are reloaded from memory with vle32_v instead of being backed up with vmv_v_v.
3. Round-key loads are interleaved with the decryption rounds inside a loop.
Hardware simulation environment verification data:
| Decrypt Test | Baseline | Optimized | Improvement ratio |
| ------------ | --------------- | ------------- | ----------------- |
| 16 bytes | 14357.22k | 15271.90k | 6% |
| 64 bytes | 29176.38k | 33592.29k | 15% |
| 256 bytes | 38664.19k | 42968.09k | 11% |
| 1024 bytes | 40308.09k | 43875.04k | 9% |
| 8192 bytes | 42811.39k | 46032.08k | 8% |
| 16384 bytes | 42809.28k | 46110.04k | 8% |
Reviewed-by: Neil Horman <nhorman@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.foundation>
MergeDate: Thu Jun 18 12:22:19 2026
(Merged from https://github.com/openssl/openssl/pull/31116)
diff --git a/crypto/aes/asm/aes-riscv64-zvkned.pl b/crypto/aes/asm/aes-riscv64-zvkned.pl
index 615542b46f..45c2efde07 100644
--- a/crypto/aes/asm/aes-riscv64-zvkned.pl
+++ b/crypto/aes/asm/aes-riscv64-zvkned.pl
@@ -210,88 +210,6 @@ ___
return $code;
}
-# aes-128 decryption with round keys v1-v11
-sub aes_128_decrypt_6 {
- my $code=<<___;
- @{[vaesz_vs $V24, $V11]} # with round key w[40,43]
- @{[vaesz_vs $V25, $V11]} # with round key w[40,43]
- @{[vaesz_vs $V26, $V11]} # with round key w[40,43]
- @{[vaesz_vs $V27, $V11]} # with round key w[40,43]
- @{[vaesz_vs $V28, $V11]} # with round key w[40,43]
- @{[vaesz_vs $V29, $V11]} # with round key w[40,43]
- @{[vaesdm_vs $V24, $V10]} # with round key w[36,39]
- @{[vaesdm_vs $V25, $V10]} # with round key w[36,39]
- @{[vaesdm_vs $V26, $V10]} # with round key w[36,39]
- @{[vaesdm_vs $V27, $V10]} # with round key w[36,39]
- @{[vaesdm_vs $V28, $V10]} # with round key w[36,39]
- @{[vaesdm_vs $V29, $V10]} # with round key w[36,39]
- @{[vaesdm_vs $V24, $V9]} # with round key w[32,35]
- @{[vaesdm_vs $V25, $V9]} # with round key w[32,35]
- @{[vaesdm_vs $V26, $V9]} # with round key w[32,35]
- @{[vaesdm_vs $V27, $V9]} # with round key w[32,35]
- @{[vaesdm_vs $V28, $V9]} # with round key w[32,35]
- @{[vaesdm_vs $V29, $V9]} # with round key w[32,35]
-
- @{[vaesdm_vs $V24, $V8]} # with round key w[28,31]
- @{[vaesdm_vs $V25, $V8]} # with round key w[28,31]
- @{[vaesdm_vs $V26, $V8]} # with round key w[28,31]
- @{[vaesdm_vs $V27, $V8]} # with round key w[28,31]
- @{[vaesdm_vs $V28, $V8]} # with round key w[28,31]
- @{[vaesdm_vs $V29, $V8]} # with round key w[28,31]
-
- @{[vaesdm_vs $V24, $V7]} # with round key w[24,27]
- @{[vaesdm_vs $V25, $V7]} # with round key w[24,27]
- @{[vaesdm_vs $V26, $V7]} # with round key w[24,27]
- @{[vaesdm_vs $V27, $V7]} # with round key w[24,27]
- @{[vaesdm_vs $V28, $V7]} # with round key w[24,27]
- @{[vaesdm_vs $V29, $V7]} # with round key w[24,27]
-
- @{[vaesdm_vs $V24, $V6]} # with round key w[20,23]
- @{[vaesdm_vs $V25, $V6]} # with round key w[20,23]
- @{[vaesdm_vs $V26, $V6]} # with round key w[20,23]
- @{[vaesdm_vs $V27, $V6]} # with round key w[20,23]
- @{[vaesdm_vs $V28, $V6]} # with round key w[20,23]
- @{[vaesdm_vs $V29, $V6]} # with round key w[20,23]
-
- @{[vaesdm_vs $V24, $V5]} # with round key w[16,19]
- @{[vaesdm_vs $V25, $V5]} # with round key w[16,19]
- @{[vaesdm_vs $V26, $V5]} # with round key w[16,19]
- @{[vaesdm_vs $V27, $V5]} # with round key w[16,19]
- @{[vaesdm_vs $V28, $V5]} # with round key w[16,19]
- @{[vaesdm_vs $V29, $V5]} # with round key w[16,19]
-
- @{[vaesdm_vs $V24, $V4]} # with round key w[12,15]
- @{[vaesdm_vs $V25, $V4]} # with round key w[12,15]
- @{[vaesdm_vs $V26, $V4]} # with round key w[12,15]
- @{[vaesdm_vs $V27, $V4]} # with round key w[12,15]
- @{[vaesdm_vs $V28, $V4]} # with round key w[12,15]
- @{[vaesdm_vs $V29, $V4]} # with round key w[12,15]
-
- @{[vaesdm_vs $V24, $V3]} # with round key w[ 8,11]
- @{[vaesdm_vs $V25, $V3]} # with round key w[ 8,11]
- @{[vaesdm_vs $V26, $V3]} # with round key w[ 8,11]
- @{[vaesdm_vs $V27, $V3]} # with round key w[ 8,11]
- @{[vaesdm_vs $V28, $V3]} # with round key w[ 8,11]
- @{[vaesdm_vs $V29, $V3]} # with round key w[ 8,11]
-
- @{[vaesdm_vs $V24, $V2]} # with round key w[ 4, 7]
- @{[vaesdm_vs $V25, $V2]} # with round key w[ 4, 7]
- @{[vaesdm_vs $V26, $V2]} # with round key w[ 4, 7]
- @{[vaesdm_vs $V27, $V2]} # with round key w[ 4, 7]
- @{[vaesdm_vs $V28, $V2]} # with round key w[ 4, 7]
- @{[vaesdm_vs $V29, $V2]} # with round key w[ 4, 7]
-
- @{[vaesdf_vs $V24, $V1]} # with round key w[ 0, 3]
- @{[vaesdf_vs $V25, $V1]} # with round key w[ 0, 3]
- @{[vaesdf_vs $V26, $V1]} # with round key w[ 0, 3]
- @{[vaesdf_vs $V27, $V1]} # with round key w[ 0, 3]
- @{[vaesdf_vs $V28, $V1]} # with round key w[ 0, 3]
- @{[vaesdf_vs $V29, $V1]} # with round key w[ 0, 3]
-___
-
- return $code;
-}
-
# aes-192 encryption with round keys v1-v13
sub aes_192_encrypt {
my $code=<<___;
@@ -557,16 +475,161 @@ ___
$code .= <<___;
.p2align 3
L_cbc_dec_128:
- # Load all 11 round keys to v1-v11 registers.
- @{[aes_128_load_key $KEYP]}
-
+ @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
# Load IV.
@{[vle32_v $V16, $IVP]}
- li $T1, 96
+.Lcbc_dec_loop:
+ li $T0, 10
+ addi $KEYP, $KEYP, 160
+ @{[vle32_v $V11, $KEYP]}
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V10, $KEYP]}
+ li $T1, 64
+ bgeu $LEN, $T1, .Lcbc_check_64
+
+ @{[vle32_v $V24, $INP]}
+ @{[vmv_v_v $V17, $V24]}
+ j 2f
+
+1:
+ li $T0, 10
+ addi $KEYP, $KEYP, 160
+ @{[vle32_v $V11, $KEYP]}
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V10, $KEYP]}
+ @{[vle32_v $V24, $INP]}
+ @{[vmv_v_v $V17, $V24]}
+ addi $OUTP, $OUTP, 16
+
+2:
+ # AES body
+ @{[vaesz_vs $V24, $V11]} # with round key w[40,43]
+ addi $T0, $T0, -2
3:
- blt $LEN, $T1, L_small
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V11, $KEYP]}
+ @{[vaesdm_vs $V24, $V10]} # with round key w[36,39]
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V10, $KEYP]}
+ @{[vaesdm_vs $V24, $V11]} # with round key w[32,35]
+ addi $T0, $T0, -2
+ bnez $T0, 3b
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V11, $KEYP]}
+ @{[vaesdm_vs $V24, $V10]} # with round key w[ 4, 7]
+ @{[vaesdf_vs $V24, $V11]} # with round key w[ 0, 3]
+
+ @{[vxor_vv $V24, $V24, $V16]}
+ @{[vse32_v $V24, $OUTP]}
+ @{[vmv_v_v $V16, $V17]}
+
+ addi $LEN, $LEN, -16
+ addi $INP, $INP, 16
+
+ bnez $LEN, 1b
+
+ @{[vse32_v $V16, $IVP]}
+
+ ret
+
+# =====================================================
+# If 64 bytes <= data length < 128 bytes, process a batch of 4 blocks:
+# 4-block CBC decryption process:
+# 1. Load 4 ciphertext blocks
+# 2. Back up the ciphertext blocks
+# 3. Decrypt each data block
+# 4. Reload ciphertext blocks into registers v17-v20 (v17-v19 for XOR, v20 as the next IV)
+# 5. XOR with previous ciphertext block (CBC chain)
+# 6. Update IV and store plaintext
+# If the data length is less than 64 bytes, it is processed one block at a time by the single-block path at .Lcbc_dec_loop
+# =====================================================
+.Lcbc_check_64:
+ li $T1, 128
+ bgeu $LEN, $T1, .Lcbc_check_128
+ @{[vle32_v $V24, $INP]}
+ addi $INP, $INP, 16
+ @{[vle32_v $V25, $INP]}
+ addi $INP, $INP, 16
+ @{[vle32_v $V26, $INP]}
+ addi $INP, $INP, 16
+ @{[vle32_v $V27, $INP]}
+ @{[vle32_v $V20, $INP]}
+ addi $INP, $INP, -16
+ @{[vle32_v $V19, $INP]}
+
+ @{[vaesz_vs $V24, $V11]} # with round key w[40,43]
+ @{[vaesz_vs $V25, $V11]} # with round key w[40,43]
+ @{[vaesz_vs $V26, $V11]} # with round key w[40,43]
+ @{[vaesz_vs $V27, $V11]} # with round key w[40,43]
+ addi $T0, $T0, -2
+4:
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V11, $KEYP]}
+ @{[vaesdm_vs $V24, $V10]} # with round key w[36,39]
+ @{[vaesdm_vs $V25, $V10]} # with round key w[36,39]
+ @{[vaesdm_vs $V26, $V10]} # with round key w[36,39]
+ @{[vaesdm_vs $V27, $V10]} # with round key w[36,39]
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V10, $KEYP]}
+ @{[vaesdm_vs $V24, $V11]} # with round key w[32,35]
+ @{[vaesdm_vs $V25, $V11]} # with round key w[32,35]
+ @{[vaesdm_vs $V26, $V11]} # with round key w[32,35]
+ @{[vaesdm_vs $V27, $V11]} # with round key w[32,35]
+ addi $T0, $T0, -2
+ bnez $T0, 4b
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V11, $KEYP]}
+ addi $INP, $INP, -16
+ @{[vle32_v $V18, $INP]}
+ addi $INP, $INP, -16
+ @{[vle32_v $V17, $INP]}
+
+ @{[vaesdm_vs $V24, $V10]} # with round key w[ 4, 7]
+ @{[vaesdm_vs $V25, $V10]} # with round key w[ 4, 7]
+ @{[vaesdm_vs $V26, $V10]} # with round key w[ 4, 7]
+ @{[vaesdm_vs $V27, $V10]} # with round key w[ 4, 7]
+
+ @{[vaesdf_vs $V24, $V11]} # with round key w[ 0, 3]
+ @{[vaesdf_vs $V25, $V11]} # with round key w[ 0, 3]
+ @{[vaesdf_vs $V26, $V11]} # with round key w[ 0, 3]
+ @{[vaesdf_vs $V27, $V11]} # with round key w[ 0, 3]
+
+ @{[vxor_vv $V24, $V24, $V16]}
+ @{[vxor_vv $V25, $V25, $V17]}
+ @{[vxor_vv $V26, $V26, $V18]}
+ @{[vxor_vv $V27, $V27, $V19]}
+
+ @{[vse32_v $V24, $OUTP]}
+ addi $OUTP, $OUTP, 16
+ @{[vse32_v $V25, $OUTP]}
+ addi $OUTP, $OUTP, 16
+ @{[vse32_v $V26, $OUTP]}
+ addi $OUTP, $OUTP, 16
+ @{[vse32_v $V27, $OUTP]}
+ addi $OUTP, $OUTP, 16
+
+ @{[vmv_v_v $V16, $V20]}
+
+ addi $LEN, $LEN, -64
+ addi $INP, $INP, 64
+ bnez $LEN, .Lcbc_dec_loop
+ @{[vse32_v $V16, $IVP]}
+
+ ret
+
+# =====================================================
+# If data length >= 128 bytes, process a batch of 8 blocks:
+# 8-block CBC decryption pipeline:
+# 1. Load 8 ciphertext blocks
+# 2. Back up the ciphertext blocks
+# 3. Decrypt each data block
+# 4. Reload ciphertext blocks into registers v17-v23 and v15 (v17-v23 for XOR, v15 as the next IV)
+# 5. XOR with previous ciphertext block (CBC chain)
+# 6. Update IV and store plaintext
+# =====================================================
+.Lcbc_check_128:
@{[vle32_v $V24, $INP]}
addi $INP, $INP, 16
@{[vle32_v $V25, $INP]}
@@ -579,67 +642,108 @@ L_cbc_dec_128:
addi $INP, $INP, 16
@{[vle32_v $V29, $INP]}
addi $INP, $INP, 16
- @{[vmv_v_v $V17, $V24]}
- @{[vmv_v_v $V18, $V25]}
- @{[vmv_v_v $V19, $V26]}
- @{[vmv_v_v $V20, $V27]}
- @{[vmv_v_v $V21, $V28]}
- @{[vmv_v_v $V22, $V29]}
+ @{[vle32_v $V30, $INP]}
+ addi $INP, $INP, 16
+ @{[vle32_v $V31, $INP]}
+ @{[vle32_v $V15, $INP]}
+ addi $INP, $INP, -16
+ @{[vle32_v $V23, $INP]}
- @{[aes_128_decrypt_6]}
+ @{[vaesz_vs $V24, $V11]} # with round key w[40,43]
+ @{[vaesz_vs $V25, $V11]} # with round key w[40,43]
+ @{[vaesz_vs $V26, $V11]} # with round key w[40,43]
+ @{[vaesz_vs $V27, $V11]} # with round key w[40,43]
+ @{[vaesz_vs $V28, $V11]} # with round key w[40,43]
+ @{[vaesz_vs $V29, $V11]} # with round key w[40,43]
+ @{[vaesz_vs $V30, $V11]} # with round key w[40,43]
+ @{[vaesz_vs $V31, $V11]} # with round key w[40,43]
+ addi $INP, $INP, -16
+ @{[vle32_v $V22, $INP]}
+ addi $INP, $INP, -16
+ @{[vle32_v $V21, $INP]}
+ addi $T0, $T0, -2
+4:
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V11, $KEYP]}
+ @{[vaesdm_vs $V24, $V10]} # with round key w[36,39]
+ @{[vaesdm_vs $V25, $V10]} # with round key w[36,39]
+ @{[vaesdm_vs $V26, $V10]} # with round key w[36,39]
+ @{[vaesdm_vs $V27, $V10]} # with round key w[36,39]
+ @{[vaesdm_vs $V28, $V10]} # with round key w[36,39]
+ @{[vaesdm_vs $V29, $V10]} # with round key w[36,39]
+ @{[vaesdm_vs $V30, $V10]} # with round key w[36,39]
+ @{[vaesdm_vs $V31, $V10]} # with round key w[36,39]
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V10, $KEYP]}
+ @{[vaesdm_vs $V24, $V11]} # with round key w[32,35]
+ @{[vaesdm_vs $V25, $V11]} # with round key w[32,35]
+ @{[vaesdm_vs $V26, $V11]} # with round key w[32,35]
+ @{[vaesdm_vs $V27, $V11]} # with round key w[32,35]
+ @{[vaesdm_vs $V28, $V11]} # with round key w[32,35]
+ @{[vaesdm_vs $V29, $V11]} # with round key w[32,35]
+ @{[vaesdm_vs $V30, $V11]} # with round key w[32,35]
+ @{[vaesdm_vs $V31, $V11]} # with round key w[32,35]
+ addi $T0, $T0, -2
+ bnez $T0, 4b
+ addi $KEYP, $KEYP, -16
+ @{[vle32_v $V11, $KEYP]}
+ addi $INP, $INP, -16
+ @{[vle32_v $V20, $INP]}
+ addi $INP, $INP, -16
+ @{[vle32_v $V19, $INP]}
+ addi $INP, $INP, -16
+ @{[vle32_v $V18, $INP]}
+ addi $INP, $INP, -16
+ @{[vle32_v $V17, $INP]}
+ @{[vaesdm_vs $V24, $V10]} # with round key w[ 4, 7]
+ @{[vaesdm_vs $V25, $V10]} # with round key w[ 4, 7]
+ @{[vaesdm_vs $V26, $V10]} # with round key w[ 4, 7]
+ @{[vaesdm_vs $V27, $V10]} # with round key w[ 4, 7]
+ @{[vaesdm_vs $V28, $V10]} # with round key w[ 4, 7]
+ @{[vaesdm_vs $V29, $V10]} # with round key w[ 4, 7]
+ @{[vaesdm_vs $V30, $V10]} # with round key w[ 4, 7]
+ @{[vaesdm_vs $V31, $V10]} # with round key w[ 4, 7]
+
+ @{[vaesdf_vs $V24, $V11]} # with round key w[ 0, 3]
+ @{[vaesdf_vs $V25, $V11]} # with round key w[ 0, 3]
+ @{[vaesdf_vs $V26, $V11]} # with round key w[ 0, 3]
+ @{[vaesdf_vs $V27, $V11]} # with round key w[ 0, 3]
+ @{[vaesdf_vs $V28, $V11]} # with round key w[ 0, 3]
+ @{[vaesdf_vs $V29, $V11]} # with round key w[ 0, 3]
+ @{[vaesdf_vs $V30, $V11]} # with round key w[ 0, 3]
+ @{[vaesdf_vs $V31, $V11]} # with round key w[ 0, 3]
- @{[vxor_vv $V24, $V24, $V16]}
+ @{[vxor_vv $V24, $V24, $V16]}
@{[vxor_vv $V25, $V25, $V17]}
@{[vxor_vv $V26, $V26, $V18]}
@{[vxor_vv $V27, $V27, $V19]}
@{[vxor_vv $V28, $V28, $V20]}
@{[vxor_vv $V29, $V29, $V21]}
+ @{[vxor_vv $V30, $V30, $V22]}
+ @{[vxor_vv $V31, $V31, $V23]}
@{[vse32_v $V24, $OUTP]}
- addi $OUTP, $OUTP, 16
+ addi $OUTP, $OUTP, 16
@{[vse32_v $V25, $OUTP]}
- addi $OUTP, $OUTP, 16
+ addi $OUTP, $OUTP, 16
@{[vse32_v $V26, $OUTP]}
addi $OUTP, $OUTP, 16
@{[vse32_v $V27, $OUTP]}
- addi $OUTP, $OUTP, 16
+ addi $OUTP, $OUTP, 16
@{[vse32_v $V28, $OUTP]}
addi $OUTP, $OUTP, 16
@{[vse32_v $V29, $OUTP]}
addi $OUTP, $OUTP, 16
-
- @{[vmv_v_v $V16, $V22]}
-
- addi $LEN, $LEN, -96
-
- bnez $LEN, 3b
- @{[vse32_v $V16, $IVP]}
-
- ret
-
-L_small:
- @{[vle32_v $V24, $INP]}
- @{[vmv_v_v $V17, $V24]}
- j 2f
-
-1:
- @{[vle32_v $V24, $INP]}
- @{[vmv_v_v $V17, $V24]}
+ @{[vse32_v $V30, $OUTP]}
+ addi $OUTP, $OUTP, 16
+ @{[vse32_v $V31, $OUTP]}
addi $OUTP, $OUTP, 16
-2:
- # AES body
- @{[aes_128_decrypt]}
-
- @{[vxor_vv $V24, $V24, $V16]}
- @{[vse32_v $V24, $OUTP]}
- @{[vmv_v_v $V16, $V17]}
-
- addi $LEN, $LEN, -16
- addi $INP, $INP, 16
-
- bnez $LEN, 1b
+ @{[vmv_v_v $V16, $V15]}
+ addi $LEN, $LEN, -128
+ addi $INP, $INP, 128
+ bnez $LEN, .Lcbc_dec_loop
@{[vse32_v $V16, $IVP]}
ret