Commit c90b7dddf2 for openssl.org

commit c90b7dddf25f44c3ed27a69ae6bc67cfdb4894cd
Author: zhoulu <zhou.lu1@zte.com.cn>
Date:   Tue Dec 2 16:34:51 2025 +0800

    Performance Optimization of SM4-CBC Encryption and Decryption with Assembly on RISC-V Architecture

    Reviewed-by: Tomas Mraz <tomas@openssl.org>
    Reviewed-by: Paul Dale <paul.dale@oracle.com>
    (Merged from https://github.com/openssl/openssl/pull/29137)

diff --git a/crypto/sm4/asm/sm4-riscv64-zvksed.pl b/crypto/sm4/asm/sm4-riscv64-zvksed.pl
index ad8585efc7..66fd127aed 100644
--- a/crypto/sm4/asm/sm4-riscv64-zvksed.pl
+++ b/crypto/sm4/asm/sm4-riscv64-zvksed.pl
@@ -59,6 +59,548 @@ my $code=<<___;
 .text
 ___

+my $BLOCK_SIZE = 16;
+my $STRIDE = -4;  # Used for reversing word order
+my $FOUR_BLOCKS = 64;
+my $EIGHT_BLOCKS = 128;
+my ($vk0,$vk1,$vk2,$vk3,$vk4,$vk5,$vk6,$vk7)=("v16","v17","v18","v19","v20","v21","v22","v23");
+my ($tmp_stride,$tmp_base)=("t1","t2");
+# Emit asm that loads $vreg from memory at $base_reg with word order reversed.
+sub reverse_order_L {
+    my $vreg = shift;      # destination vector register
+    my $base_reg = shift;  # scalar register holding the address of word 0
+    # Clobbers $tmp_base; the caller must have loaded $STRIDE into $tmp_stride.
+    return <<___;
+        addi $tmp_base, $base_reg, 12
+        @{[vlse32_v $vreg, $tmp_base, $tmp_stride]}
+___
+}
+
+# Emit asm that stores $vreg to memory at $base_reg with word order reversed.
+sub reverse_order_S {
+    my $vreg = shift;      # source vector register
+    my $base_reg = shift;  # scalar register holding the address of word 0
+    # Clobbers $tmp_base; the caller must have loaded $STRIDE into $tmp_stride.
+    return <<___;
+        addi $tmp_base, $base_reg, 12
+        @{[vsse32_v $vreg, $tmp_base, $tmp_stride]}
+___
+}
+
+# Emit asm that loads the 32 encryption round keys into $vk0..$vk7.
+sub enc_load_key {
+    my $keys = shift;  # scalar register pointing at the stored round keys
+    # NOTE: the emitted code advances $keys by 7 * $BLOCK_SIZE (112 bytes).
+    my $code=<<___;
+    # Order of elements was adjusted in set_encrypt_key()
+    @{[vle32_v $vk0, $keys]} # rk[0:3]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk1, $keys]} # rk[4:7]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk2, $keys]} # rk[8:11]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk3, $keys]} # rk[12:15]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk4, $keys]} # rk[16:19]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk5, $keys]} # rk[20:23]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk6, $keys]} # rk[24:27]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk7, $keys]} # rk[28:31]
+___
+
+    return $code;
+}
+
+sub dec_load_key {
+    my $keys = shift;  # scalar register pointing at the stored round keys
+    # Emit asm loading $vk7 first, $vk0 last; advances $keys by 112 bytes.
+    my $code=<<___;
+    # Order of elements was adjusted in set_decrypt_key()
+    @{[vle32_v $vk7, $keys]} # rk[31:28]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk6, $keys]} # rk[27:24]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk5, $keys]} # rk[23:20]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk4, $keys]} # rk[19:16]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk3, $keys]} # rk[15:12]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk2, $keys]} # rk[11:8]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk1, $keys]} # rk[7:4]
+    addi $keys, $keys, $BLOCK_SIZE
+    @{[vle32_v $vk0, $keys]} # rk[3:0]
+___
+
+    return $code;
+}
+
+# Emit asm that encrypts one block in place, applying $vk0 through $vk7 in order.
+sub enc_blk {
+    my $data = shift;  # vector register holding the block; updated in place
+    # Expects the round keys to be in $vk0..$vk7 already (see enc_load_key).
+    my $code=<<___;
+    @{[vsm4r_vs $data, $vk0]}
+    @{[vsm4r_vs $data, $vk1]}
+    @{[vsm4r_vs $data, $vk2]}
+    @{[vsm4r_vs $data, $vk3]}
+    @{[vsm4r_vs $data, $vk4]}
+    @{[vsm4r_vs $data, $vk5]}
+    @{[vsm4r_vs $data, $vk6]}
+    @{[vsm4r_vs $data, $vk7]}
+___
+
+    return $code;
+}
+
+# Emit asm that decrypts one block in place, applying $vk7 down to $vk0.
+sub dec_blk {
+    my $data = shift;  # vector register holding the block; updated in place
+    # Expects the round keys to be in $vk7..$vk0 already (see dec_load_key).
+    my $code=<<___;
+    @{[vsm4r_vs $data, $vk7]}
+    @{[vsm4r_vs $data, $vk6]}
+    @{[vsm4r_vs $data, $vk5]}
+    @{[vsm4r_vs $data, $vk4]}
+    @{[vsm4r_vs $data, $vk3]}
+    @{[vsm4r_vs $data, $vk2]}
+    @{[vsm4r_vs $data, $vk1]}
+    @{[vsm4r_vs $data, $vk0]}
+___
+
+    return $code;
+}
+
+# Emit asm that decrypts 4 independent blocks in place, $vk7 down to $vk0.
+sub dec_4blks {
+    my $data0 = shift;  # vector register holding block 0; updated in place
+    my $data1 = shift;  # vector register holding block 1; updated in place
+    my $data2 = shift;  # vector register holding block 2; updated in place
+    my $data3 = shift;  # vector register holding block 3; updated in place
+    # Each key vector is applied to all 4 blocks before moving to the next key.
+    my $code=<<___;
+    @{[vsm4r_vs $data0, $vk7]}
+    @{[vsm4r_vs $data1, $vk7]}
+    @{[vsm4r_vs $data2, $vk7]}
+    @{[vsm4r_vs $data3, $vk7]}
+
+    @{[vsm4r_vs $data0, $vk6]}
+    @{[vsm4r_vs $data1, $vk6]}
+    @{[vsm4r_vs $data2, $vk6]}
+    @{[vsm4r_vs $data3, $vk6]}
+
+    @{[vsm4r_vs $data0, $vk5]}
+    @{[vsm4r_vs $data1, $vk5]}
+    @{[vsm4r_vs $data2, $vk5]}
+    @{[vsm4r_vs $data3, $vk5]}
+
+    @{[vsm4r_vs $data0, $vk4]}
+    @{[vsm4r_vs $data1, $vk4]}
+    @{[vsm4r_vs $data2, $vk4]}
+    @{[vsm4r_vs $data3, $vk4]}
+
+    @{[vsm4r_vs $data0, $vk3]}
+    @{[vsm4r_vs $data1, $vk3]}
+    @{[vsm4r_vs $data2, $vk3]}
+    @{[vsm4r_vs $data3, $vk3]}
+
+    @{[vsm4r_vs $data0, $vk2]}
+    @{[vsm4r_vs $data1, $vk2]}
+    @{[vsm4r_vs $data2, $vk2]}
+    @{[vsm4r_vs $data3, $vk2]}
+
+    @{[vsm4r_vs $data0, $vk1]}
+    @{[vsm4r_vs $data1, $vk1]}
+    @{[vsm4r_vs $data2, $vk1]}
+    @{[vsm4r_vs $data3, $vk1]}
+
+    @{[vsm4r_vs $data0, $vk0]}
+    @{[vsm4r_vs $data1, $vk0]}
+    @{[vsm4r_vs $data2, $vk0]}
+    @{[vsm4r_vs $data3, $vk0]}
+___
+
+    return $code;
+}
+
+####
+# void rv64i_zvksed_sm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+#                                   size_t len, const SM4_KEY *key,
+#                                   unsigned char *iv, int enc);
+# No-op unless len is a non-zero multiple of 16; the enc argument is unused.
+{
+my ($in,$out,$len,$keys,$ivp)=("a0","a1","a2","a3","a4");
+my ($tmp,$base)=("t0","t2");
+my ($vdata0,$vdata1,$vdata2,$vdata3,$vdata4,$vdata5,$vdata6,$vdata7)=("v1","v2","v3","v4","v5","v6","v7","v24");
+my ($vivec)=("v8");
+
+$code .= <<___;
+.p2align 3
+.globl rv64i_zvksed_sm4_cbc_encrypt
+.type rv64i_zvksed_sm4_cbc_encrypt,\@function
+rv64i_zvksed_sm4_cbc_encrypt:
+    # check whether the length is a multiple of 16 and >= 16
+    li $tmp, $BLOCK_SIZE
+    bltu $len, $tmp, .Lcbc_enc_end
+    andi $tmp, $len, 15
+    bnez $tmp, .Lcbc_enc_end
+
+    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
+    # Load 32 round keys
+    @{[enc_load_key $keys]}
+
+    # Load IV
+    @{[vle32_v $vivec, $ivp]}
+# =====================================================
+# If data length >= 64 bytes, process 4 blocks per iteration:
+# 4-block CBC encryption pipeline:
+#   1. Load 4 plaintext blocks
+#   2. XOR the block with the IV or the previous ciphertext block (CBC chain)
+#   3. Reverse bytes for SM4 endianness
+#   4. Encrypt each data block using enc_blk
+#   5. Restore the byte order and store the ciphertext block
+#   6. Reload the stored ciphertext block as the next IV
+# If data length < 64 bytes, process it block by block at .Lcbc_enc_single
+# =====================================================
+.Lcbc_enc_loop:
+    li $tmp, $FOUR_BLOCKS
+    bltu $len, $tmp, .Lcbc_enc_single
+    # Load input data0-data3
+    @{[vle32_v $vdata0, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata1, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata2, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata3, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    #XOR with IV
+    @{[vxor_vv $vdata0, $vdata0, $vivec]}
+
+    @{[vrev8_v $vdata0, $vdata0]}
+    # Encrypt with all keys
+    @{[enc_blk $vdata0]}
+    @{[vrev8_v $vdata0, $vdata0]}
+
+    # Save the ciphertext (in reverse element order)
+    li $tmp_stride, $STRIDE
+    @{[reverse_order_S $vdata0, $out]}
+    #Update IV to ciphertext block 0
+    @{[vle32_v $vivec, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata1, $vdata1, $vivec]}
+
+    @{[vrev8_v $vdata1, $vdata1]}
+    @{[enc_blk $vdata1]}
+    @{[vrev8_v $vdata1, $vdata1]}
+
+    @{[reverse_order_S $vdata1, $out]}
+
+    #Update IV to ciphertext block 1
+    @{[vle32_v $vivec, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata2, $vdata2, $vivec]}
+
+    @{[vrev8_v $vdata2, $vdata2]}
+    @{[enc_blk $vdata2]}
+    @{[vrev8_v $vdata2, $vdata2]}
+
+    @{[reverse_order_S $vdata2, $out]}
+    #Update IV to ciphertext block 2
+    @{[vle32_v $vivec, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata3, $vdata3, $vivec]}
+
+    @{[vrev8_v $vdata3, $vdata3]}
+    @{[enc_blk $vdata3]}
+    @{[vrev8_v $vdata3, $vdata3]}
+
+    @{[reverse_order_S $vdata3, $out]}
+    #Update IV to ciphertext block 3
+    @{[vle32_v $vivec, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    addi $len, $len, -$FOUR_BLOCKS
+    bnez $len, .Lcbc_enc_loop
+    #Save the final IV
+    @{[vse32_v $vivec, $ivp]}
+    ret
+
+.Lcbc_enc_single:
+    # Load input data0
+    @{[vle32_v $vdata0, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    #XOR with IV
+    @{[vxor_vv $vdata0, $vdata0, $vivec]}
+
+    @{[vrev8_v $vdata0, $vdata0]}
+    # Encrypt with all keys
+    @{[enc_blk $vdata0]}
+    @{[vrev8_v $vdata0, $vdata0]}
+
+    # Save the ciphertext (in reverse element order)
+    li $tmp_stride, $STRIDE
+    @{[reverse_order_S $vdata0, $out]}
+
+    # Update IV to ciphertext block 0
+    @{[vle32_v $vivec, $out]}
+    addi $out, $out, $BLOCK_SIZE
+    addi $len, $len, -$BLOCK_SIZE
+
+    li $tmp, $BLOCK_SIZE
+    bgeu $len, $tmp, .Lcbc_enc_single
+    # Save the final IV
+    @{[vse32_v $vivec, $ivp]}
+.Lcbc_enc_end:
+    ret
+.size rv64i_zvksed_sm4_cbc_encrypt,.-rv64i_zvksed_sm4_cbc_encrypt
+___
+
+####
+# void rv64i_zvksed_sm4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+#                                   size_t len, const SM4_KEY *key,
+#                                   unsigned char *iv, int enc);
+# No-op unless len is a non-zero multiple of 16; the enc argument is unused.
+$code .= <<___;
+.p2align 3
+.globl rv64i_zvksed_sm4_cbc_decrypt
+.type rv64i_zvksed_sm4_cbc_decrypt,\@function
+rv64i_zvksed_sm4_cbc_decrypt:
+    # check whether the length is a multiple of 16 and >= 16
+    li $tmp, $BLOCK_SIZE
+    bltu $len, $tmp, .Lcbc_dec_end
+    andi $tmp, $len, 15
+    bnez $tmp, .Lcbc_dec_end
+
+    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
+    # Load IV (in reverse element order)
+    li $tmp_stride, $STRIDE
+    @{[reverse_order_L $vivec, $ivp]}
+
+    # Load 32 round keys
+    @{[dec_load_key $keys]}
+# =====================================================
+# If data length >= 128 bytes, process 8 blocks per iteration:
+# 8-block CBC decryption pipeline:
+#   1. Load 8 ciphertext blocks
+#   2. Reverse bytes for SM4 endianness
+#   3. Decrypt the 8 blocks with two dec_4blks calls, then restore byte order
+#   4. XOR with the IV or the previous ciphertext block (CBC chain)
+#   5. Update the IV and store the plaintext in reverse element order
+# =====================================================
+.Lcbc_dec_loop:
+    li $tmp, $EIGHT_BLOCKS
+    bltu $len, $tmp, .Lcbc_check_64
+    # Load input data0-data7
+    @{[vle32_v $vdata0, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata1, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata2, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata3, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata4, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata5, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata6, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata7, $in]}
+    addi $in, $in, $BLOCK_SIZE
+
+    @{[vrev8_v $vdata0, $vdata0]}
+    @{[vrev8_v $vdata1, $vdata1]}
+    @{[vrev8_v $vdata2, $vdata2]}
+    @{[vrev8_v $vdata3, $vdata3]}
+    @{[vrev8_v $vdata4, $vdata4]}
+    @{[vrev8_v $vdata5, $vdata5]}
+    @{[vrev8_v $vdata6, $vdata6]}
+    @{[vrev8_v $vdata7, $vdata7]}
+    # Decrypt 8 data blocks
+    @{[dec_4blks $vdata0,$vdata1,$vdata2,$vdata3]}
+    @{[dec_4blks $vdata4,$vdata5,$vdata6,$vdata7]}
+    @{[vrev8_v $vdata0, $vdata0]}
+    @{[vrev8_v $vdata1, $vdata1]}
+    @{[vrev8_v $vdata2, $vdata2]}
+    @{[vrev8_v $vdata3, $vdata3]}
+    @{[vrev8_v $vdata4, $vdata4]}
+    @{[vrev8_v $vdata5, $vdata5]}
+    @{[vrev8_v $vdata6, $vdata6]}
+    @{[vrev8_v $vdata7, $vdata7]}
+
+    @{[vxor_vv $vdata0, $vdata0, $vivec]}
+
+    # Update ciphertext to IV (in reverse element order)
+    addi $base, $in, -128
+    @{[reverse_order_L $vivec, $base]}
+
+    # Save the plaintext (in reverse element order)
+    @{[reverse_order_S $vdata0, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata1, $vdata1, $vivec]}
+
+    addi $base, $in, -112
+    @{[reverse_order_L $vivec, $base]}
+    @{[reverse_order_S $vdata1, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata2, $vdata2, $vivec]}
+
+    addi $base, $in, -96
+    @{[reverse_order_L $vivec, $base]}
+    @{[reverse_order_S $vdata2, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata3, $vdata3, $vivec]}
+
+    addi $base, $in, -80
+    @{[reverse_order_L $vivec, $base]}
+    @{[reverse_order_S $vdata3, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata4, $vdata4, $vivec]}
+
+    addi $base, $in, -64
+    @{[reverse_order_L $vivec, $base]}
+    @{[reverse_order_S $vdata4, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata5, $vdata5, $vivec]}
+
+    addi $base, $in, -48
+    @{[reverse_order_L $vivec, $base]}
+    @{[reverse_order_S $vdata5, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata6, $vdata6, $vivec]}
+
+    addi $base, $in, -32
+    @{[reverse_order_L $vivec, $base]}
+    @{[reverse_order_S $vdata6, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata7, $vdata7, $vivec]}
+
+    addi $base, $in, -16
+    @{[reverse_order_L $vivec, $base]}
+    @{[reverse_order_S $vdata7, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    addi $len, $len, -$EIGHT_BLOCKS
+    bnez $len, .Lcbc_dec_loop
+    #Save the final IV (in reverse element order)
+    @{[reverse_order_S $vivec, $ivp]}
+    ret
+# =====================================================
+# If data length >= 64 bytes, process 4 blocks per iteration:
+# 4-block CBC decryption process:
+#   1. Load 4 ciphertext blocks
+#   2. Reverse bytes for SM4 endianness
+#   3. Decrypt the 4 blocks with one dec_4blks call, then restore byte order
+#   4. XOR with the IV or the previous ciphertext block (CBC chain)
+#   5. Update the IV and store the plaintext in reverse element order
+# If data length < 64 bytes, process it block by block at .Lcbc_dec_single
+# =====================================================
+.Lcbc_check_64:
+    li $tmp, $FOUR_BLOCKS
+    bltu $len, $tmp, .Lcbc_dec_single
+    # Load input data0-data3
+    @{[vle32_v $vdata0, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata1, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata2, $in]}
+    addi $in, $in, $BLOCK_SIZE
+    @{[vle32_v $vdata3, $in]}
+    addi $in, $in, $BLOCK_SIZE
+
+    @{[vrev8_v $vdata0, $vdata0]}
+    @{[vrev8_v $vdata1, $vdata1]}
+    @{[vrev8_v $vdata2, $vdata2]}
+    @{[vrev8_v $vdata3, $vdata3]}
+    # Decrypt 4 data blocks
+    @{[dec_4blks $vdata0,$vdata1,$vdata2,$vdata3]}
+    @{[vrev8_v $vdata0, $vdata0]}
+    @{[vrev8_v $vdata1, $vdata1]}
+    @{[vrev8_v $vdata2, $vdata2]}
+    @{[vrev8_v $vdata3, $vdata3]}
+
+    @{[vxor_vv $vdata0, $vdata0, $vivec]}
+
+    # Update ciphertext to IV (in reverse element order)
+    addi $base, $in, -64
+    @{[reverse_order_L $vivec, $base]}
+    # Save the plaintext (in reverse element order)
+    @{[reverse_order_S $vdata0, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata1, $vdata1, $vivec]}
+
+    addi $base, $in, -48
+    @{[reverse_order_L $vivec, $base]}
+    @{[reverse_order_S $vdata1, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata2, $vdata2, $vivec]}
+
+    addi $base, $in, -32
+    @{[reverse_order_L $vivec, $base]}
+    @{[reverse_order_S $vdata2, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    @{[vxor_vv $vdata3, $vdata3, $vivec]}
+
+    addi $base, $in, -16
+    @{[reverse_order_L $vivec, $base]}
+    @{[reverse_order_S $vdata3, $out]}
+    addi $out, $out, $BLOCK_SIZE
+
+    addi $len, $len, -$FOUR_BLOCKS
+    bnez $len, .Lcbc_check_64
+    #Save the final IV (in reverse element order)
+    @{[reverse_order_S $vivec, $ivp]}
+    ret
+
+.Lcbc_dec_single:
+    # Load input data0
+    @{[vle32_v $vdata0, $in]}
+    addi $in, $in, $BLOCK_SIZE
+
+    @{[vrev8_v $vdata0, $vdata0]}
+    # Decrypt with all keys
+    @{[dec_blk $vdata0]}
+    @{[vrev8_v $vdata0, $vdata0]}
+
+    #XOR with IV
+    @{[vxor_vv $vdata0, $vdata0, $vivec]}
+
+    # Update ciphertext to IV (in reverse element order)
+    li $tmp_stride, $STRIDE
+    addi $base, $in, -$BLOCK_SIZE
+    @{[reverse_order_L $vivec, $base]}
+    # Save the plaintext (in reverse element order)
+    @{[reverse_order_S $vdata0, $out]}
+    addi $out, $out, $BLOCK_SIZE
+    addi $len, $len, -$BLOCK_SIZE
+
+    li $tmp, $BLOCK_SIZE
+    bgeu $len, $tmp, .Lcbc_dec_single
+    #Save the final IV (in reverse element order)
+    @{[reverse_order_S $vivec, $ivp]}
+.Lcbc_dec_end:
+    ret
+.size rv64i_zvksed_sm4_cbc_decrypt,.-rv64i_zvksed_sm4_cbc_decrypt
+___
+}
+
 ####
 # int rv64i_zvksed_sm4_set_encrypt_key(const unsigned char *userKey,
 #                                      SM4_KEY *key);
@@ -94,19 +636,19 @@ rv64i_zvksed_sm4_set_encrypt_key:

     # Store round keys
     @{[vse32_v $vk0, $keys]} # rk[0:3]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vse32_v $vk1, $keys]} # rk[4:7]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vse32_v $vk2, $keys]} # rk[8:11]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vse32_v $vk3, $keys]} # rk[12:15]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vse32_v $vk4, $keys]} # rk[16:19]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vse32_v $vk5, $keys]} # rk[20:23]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vse32_v $vk6, $keys]} # rk[24:27]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vse32_v $vk7, $keys]} # rk[28:31]

     li a0, 1
@@ -150,21 +692,21 @@ rv64i_zvksed_sm4_set_decrypt_key:

     # Store round keys in reverse order
     addi $keys, $keys, 12
-    li $stride, -4
+    li $stride, $STRIDE
     @{[vsse32_v $vk7, $keys, $stride]} # rk[31:28]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vsse32_v $vk6, $keys, $stride]} # rk[27:24]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vsse32_v $vk5, $keys, $stride]} # rk[23:20]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vsse32_v $vk4, $keys, $stride]} # rk[19:16]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vsse32_v $vk3, $keys, $stride]} # rk[15:12]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vsse32_v $vk2, $keys, $stride]} # rk[11:8]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vsse32_v $vk1, $keys, $stride]} # rk[7:4]
-    addi $keys, $keys, 16
+    addi $keys, $keys, $BLOCK_SIZE
     @{[vsse32_v $vk0, $keys, $stride]} # rk[3:0]

     li a0, 1
@@ -178,8 +720,8 @@ ___
 #                               const SM4_KEY *key);
 #
 {
-my ($in,$out,$keys,$stride)=("a0","a1","a2","t0");
-my ($vdata,$vk0,$vk1,$vk2,$vk3,$vk4,$vk5,$vk6,$vk7,$vgen)=("v1","v2","v3","v4","v5","v6","v7","v8","v9","v10");
+my ($in,$out,$keys)=("a0","a1","a2");
+my ($vdata)=("v1");
 $code .= <<___;
 .p2align 3
 .globl rv64i_zvksed_sm4_encrypt
@@ -187,42 +729,19 @@ $code .= <<___;
 rv64i_zvksed_sm4_encrypt:
     @{[vsetivli__x0_4_e32_m1_tu_mu]}

-    # Order of elements was adjusted in set_encrypt_key()
-    @{[vle32_v $vk0, $keys]} # rk[0:3]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk1, $keys]} # rk[4:7]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk2, $keys]} # rk[8:11]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk3, $keys]} # rk[12:15]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk4, $keys]} # rk[16:19]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk5, $keys]} # rk[20:23]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk6, $keys]} # rk[24:27]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk7, $keys]} # rk[28:31]
+    @{[enc_load_key $keys]}

     # Load input data
     @{[vle32_v $vdata, $in]}
     @{[vrev8_v $vdata, $vdata]}

     # Encrypt with all keys
-    @{[vsm4r_vs $vdata, $vk0]}
-    @{[vsm4r_vs $vdata, $vk1]}
-    @{[vsm4r_vs $vdata, $vk2]}
-    @{[vsm4r_vs $vdata, $vk3]}
-    @{[vsm4r_vs $vdata, $vk4]}
-    @{[vsm4r_vs $vdata, $vk5]}
-    @{[vsm4r_vs $vdata, $vk6]}
-    @{[vsm4r_vs $vdata, $vk7]}
+    @{[enc_blk $vdata]}

     # Save the ciphertext (in reverse element order)
     @{[vrev8_v $vdata, $vdata]}
-    li $stride, -4
-    addi $out, $out, 12
-    @{[vsse32_v $vdata, $out, $stride]}
+    li $tmp_stride, $STRIDE
+    @{[reverse_order_S $vdata, $out]}

     ret
 .size rv64i_zvksed_sm4_encrypt,.-rv64i_zvksed_sm4_encrypt
@@ -234,8 +753,8 @@ ___
 #                               const SM4_KEY *key);
 #
 {
-my ($in,$out,$keys,$stride)=("a0","a1","a2","t0");
-my ($vdata,$vk0,$vk1,$vk2,$vk3,$vk4,$vk5,$vk6,$vk7,$vgen)=("v1","v2","v3","v4","v5","v6","v7","v8","v9","v10");
+my ($in,$out,$keys)=("a0","a1","a2");
+my ($vdata)=("v1");
 $code .= <<___;
 .p2align 3
 .globl rv64i_zvksed_sm4_decrypt
@@ -243,42 +762,19 @@ $code .= <<___;
 rv64i_zvksed_sm4_decrypt:
     @{[vsetivli__x0_4_e32_m1_tu_mu]}

-    # Order of elements was adjusted in set_decrypt_key()
-    @{[vle32_v $vk7, $keys]} # rk[31:28]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk6, $keys]} # rk[27:24]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk5, $keys]} # rk[23:20]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk4, $keys]} # rk[19:16]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk3, $keys]} # rk[15:12]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk2, $keys]} # rk[11:8]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk1, $keys]} # rk[7:4]
-    addi $keys, $keys, 16
-    @{[vle32_v $vk0, $keys]} # rk[3:0]
+    @{[dec_load_key $keys]}

     # Load input data
     @{[vle32_v $vdata, $in]}
     @{[vrev8_v $vdata, $vdata]}

     # Decrypt with all keys
-    @{[vsm4r_vs $vdata, $vk7]}
-    @{[vsm4r_vs $vdata, $vk6]}
-    @{[vsm4r_vs $vdata, $vk5]}
-    @{[vsm4r_vs $vdata, $vk4]}
-    @{[vsm4r_vs $vdata, $vk3]}
-    @{[vsm4r_vs $vdata, $vk2]}
-    @{[vsm4r_vs $vdata, $vk1]}
-    @{[vsm4r_vs $vdata, $vk0]}
+    @{[dec_blk $vdata]}

     # Save the plaintext (in reverse element order)
     @{[vrev8_v $vdata, $vdata]}
-    li $stride, -4
-    addi $out, $out, 12
-    @{[vsse32_v $vdata, $out, $stride]}
+    li $tmp_stride, $STRIDE
+    @{[reverse_order_S $vdata, $out]}

     ret
 .size rv64i_zvksed_sm4_decrypt,.-rv64i_zvksed_sm4_decrypt
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
index 0e9e2e1d39..f7b436255c 100644
--- a/include/crypto/sm4_platform.h
+++ b/include/crypto/sm4_platform.h
@@ -50,6 +50,12 @@ void rv64i_zvksed_sm4_encrypt(const unsigned char *in, unsigned char *out,
                               const SM4_KEY *key);
 void rv64i_zvksed_sm4_decrypt(const unsigned char *in, unsigned char *out,
                               const SM4_KEY *key);
+void rv64i_zvksed_sm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                                  size_t len, const SM4_KEY *key,
+                                  unsigned char *iv, int enc);
+void rv64i_zvksed_sm4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+                                  size_t len, const SM4_KEY *key,
+                                  unsigned char *iv, int enc);
 #  elif (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64))
 /* Intel x86_64 support */
 #   include "internal/cryptlib.h"
diff --git a/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc b/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc
index 763d9d09dd..683b2b25a1 100644
--- a/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc
+++ b/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc
@@ -26,6 +26,7 @@ static int cipher_hw_rv64i_zvksed_sm4_initkey(PROV_CIPHER_CTX *ctx,
     SM4_KEY *ks = &sctx->ks.ks;

     ctx->ks = ks;
+
     if (ctx->enc
             || (ctx->mode != EVP_CIPH_ECB_MODE
                 && ctx->mode != EVP_CIPH_CBC_MODE)) {
@@ -38,6 +39,14 @@ static int cipher_hw_rv64i_zvksed_sm4_initkey(PROV_CIPHER_CTX *ctx,
         ctx->stream.cbc = NULL;
     }

+    if (ctx->mode == EVP_CIPH_CBC_MODE) {
+        if (ctx->enc) {
+            ctx->stream.cbc = (cbc128_f) rv64i_zvksed_sm4_cbc_encrypt;
+        } else {
+            ctx->stream.cbc = (cbc128_f) rv64i_zvksed_sm4_cbc_decrypt;
+        }
+    }
+
     return 1;
 }

diff --git a/test/recipes/30-test_evp_data/evpciph_sm4.txt b/test/recipes/30-test_evp_data/evpciph_sm4.txt
index 993cf7b51e..f23129cd8d 100644
--- a/test/recipes/30-test_evp_data/evpciph_sm4.txt
+++ b/test/recipes/30-test_evp_data/evpciph_sm4.txt
@@ -13,6 +13,12 @@ Key = 0123456789ABCDEFFEDCBA9876543210
 Plaintext  = 0123456789ABCDEFFEDCBA9876543210
 Ciphertext = 681EDF34D206965E86B3E94F536E4246

+Cipher = SM4-CBC
+Key = 0123456789ABCDEFFEDCBA9876543210
+IV  = 0123456789ABCDEFFEDCBA9876543210
+Plaintext = 0123456789ABCDEFFEDCBA9876543210
+Ciphertext = 2677F46B09C122CC975533105BD4A22A
+
 Cipher = SM4-CBC
 Key = 0123456789ABCDEFFEDCBA9876543210
 IV  = 0123456789ABCDEFFEDCBA9876543210