Commit ee9e0f6d8f for openssl.org
commit ee9e0f6d8f32a46d62feaa571f1be48a003ca2bc
Author: Zhiguo Zhou <zhiguo.zhou@intel.com>
Date: Fri Mar 6 14:39:08 2026 +0800
bn: Save/restore non-volatile registers in RSAZ AVX-IFMA code for Win64
The Windows x64 calling convention requires that registers %rsi, %rdi,
and %xmm6 through %xmm15 be preserved by the callee. This patch updates
the RSAZ-2K, 3K, and 4K AVX-IFMA assembly routines to push/pop
%rsi/%rdi and to save and restore the non-volatile XMM registers on the
stack when building for Win64.
This ensures ABI compliance and prevents potential data corruption or
crashes in callers that rely on these registers being preserved across
function calls.
Functions updated:
- ossl_rsaz_amm52x20_x1_avxifma256
- ossl_rsaz_amm52x20_x2_avxifma256
- ossl_extract_multiplier_2x20_win5_avx
- ossl_extract_multiplier_2x30_win5_avx
- ossl_extract_multiplier_2x40_win5_avx
Reviewed-by: Saša Nedvědický <sashan@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
MergeDate: Fri Mar 13 12:15:10 2026
(Merged from https://github.com/openssl/openssl/pull/30280)
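The 168-byte frame allocated in the hunks below is ten 16-byte XMM slots plus
8 bytes of padding; the padding matters because vmovapd faults on an address
that is not 16-byte aligned. A minimal sketch of the alignment arithmetic for
the extract helpers, where the only pushes are the two added here (the amm52
routines rely on their pre-existing GPR pushes keeping the same stack parity,
which is not visible in these hunks):

    # entry:                 %rsp ≡ 8 (mod 16)   # call pushed the 8-byte return address
    # push %rsi; push %rdi:  %rsp ≡ 8 (mod 16)   # an even number of 8-byte pushes keeps the parity
    # lea -168(%rsp), %rsp:  168 = 16*10 + 8
    #                        %rsp ≡ 0 (mod 16)   # ten aligned slots for %xmm6..%xmm15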
diff --git a/crypto/bn/asm/rsaz-2k-avxifma.pl b/crypto/bn/asm/rsaz-2k-avxifma.pl
index ea45d2051a..1c020842a7 100644
--- a/crypto/bn/asm/rsaz-2k-avxifma.pl
+++ b/crypto/bn/asm/rsaz-2k-avxifma.pl
@@ -362,6 +362,23 @@ ossl_rsaz_amm52x20_x1_avxifma256:
.cfi_push %r14
push %r15
.cfi_push %r15
+___
+$code.=<<___ if ($win64);
+ push %rsi # save non-volatile registers
+ push %rdi
+ lea -168(%rsp), %rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+ vmovapd %xmm6, `16*0`(%rsp)
+ vmovapd %xmm7, `16*1`(%rsp)
+ vmovapd %xmm8, `16*2`(%rsp)
+ vmovapd %xmm9, `16*3`(%rsp)
+ vmovapd %xmm10, `16*4`(%rsp)
+ vmovapd %xmm11, `16*5`(%rsp)
+ vmovapd %xmm12, `16*6`(%rsp)
+ vmovapd %xmm13, `16*7`(%rsp)
+ vmovapd %xmm14, `16*8`(%rsp)
+ vmovapd %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
.Lossl_rsaz_amm52x20_x1_avxifma256_body:
# Zeroing accumulators
@@ -401,6 +418,23 @@ $code.=<<___;
vmovdqu $R2_0, `4*32`($res)
vzeroupper
+___
+$code.=<<___ if ($win64);
+ vmovapd `16*0`(%rsp), %xmm6
+ vmovapd `16*1`(%rsp), %xmm7
+ vmovapd `16*2`(%rsp), %xmm8
+ vmovapd `16*3`(%rsp), %xmm9
+ vmovapd `16*4`(%rsp), %xmm10
+ vmovapd `16*5`(%rsp), %xmm11
+ vmovapd `16*6`(%rsp), %xmm12
+ vmovapd `16*7`(%rsp), %xmm13
+ vmovapd `16*8`(%rsp), %xmm14
+ vmovapd `16*9`(%rsp), %xmm15
+ lea 168(%rsp), %rsp
+ pop %rdi
+ pop %rsi
+___
+$code.=<<___;
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
@@ -553,6 +587,23 @@ ossl_rsaz_amm52x20_x2_avxifma256:
.cfi_push %r14
push %r15
.cfi_push %r15
+___
+$code.=<<___ if ($win64);
+ push %rsi # save non-volatile registers
+ push %rdi
+ lea -168(%rsp), %rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+ vmovapd %xmm6, `16*0`(%rsp)
+ vmovapd %xmm7, `16*1`(%rsp)
+ vmovapd %xmm8, `16*2`(%rsp)
+ vmovapd %xmm9, `16*3`(%rsp)
+ vmovapd %xmm10, `16*4`(%rsp)
+ vmovapd %xmm11, `16*5`(%rsp)
+ vmovapd %xmm12, `16*6`(%rsp)
+ vmovapd %xmm13, `16*7`(%rsp)
+ vmovapd %xmm14, `16*8`(%rsp)
+ vmovapd %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
.Lossl_rsaz_amm52x20_x2_avxifma256_body:
# Zeroing accumulators
@@ -604,6 +655,23 @@ $code.=<<___;
vmovdqu $R2_1, `9*32`($res)
vzeroupper
+___
+$code.=<<___ if ($win64);
+ vmovapd `16*0`(%rsp), %xmm6
+ vmovapd `16*1`(%rsp), %xmm7
+ vmovapd `16*2`(%rsp), %xmm8
+ vmovapd `16*3`(%rsp), %xmm9
+ vmovapd `16*4`(%rsp), %xmm10
+ vmovapd `16*5`(%rsp), %xmm11
+ vmovapd `16*6`(%rsp), %xmm12
+ vmovapd `16*7`(%rsp), %xmm13
+ vmovapd `16*8`(%rsp), %xmm14
+ vmovapd `16*9`(%rsp), %xmm15
+ lea 168(%rsp), %rsp
+ pop %rdi
+ pop %rsi
+___
+$code.=<<___;
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
@@ -663,6 +731,23 @@ $code.=<<___;
ossl_extract_multiplier_2x20_win5_avx:
.cfi_startproc
endbranch
+___
+$code.=<<___ if ($win64);
+ push %rsi # save non-volatile registers
+ push %rdi
+ lea -168(%rsp), %rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+ vmovapd %xmm6, `16*0`(%rsp)
+ vmovapd %xmm7, `16*1`(%rsp)
+ vmovapd %xmm8, `16*2`(%rsp)
+ vmovapd %xmm9, `16*3`(%rsp)
+ vmovapd %xmm10, `16*4`(%rsp)
+ vmovapd %xmm11, `16*5`(%rsp)
+ vmovapd %xmm12, `16*6`(%rsp)
+ vmovapd %xmm13, `16*7`(%rsp)
+ vmovapd %xmm14, `16*8`(%rsp)
+ vmovapd %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
vmovapd .Lones(%rip), $ones # broadcast ones
vmovq $red_tbl_idx1, $tmp_xmm
vpbroadcastq $tmp_xmm, $idx1
@@ -708,6 +793,24 @@ ___
foreach (0..9) {
$code.="vmovdqu $t[$_], `${_}*32`($out) \n";
}
+$code.=<<___;
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ vmovapd `16*0`(%rsp), %xmm6
+ vmovapd `16*1`(%rsp), %xmm7
+ vmovapd `16*2`(%rsp), %xmm8
+ vmovapd `16*3`(%rsp), %xmm9
+ vmovapd `16*4`(%rsp), %xmm10
+ vmovapd `16*5`(%rsp), %xmm11
+ vmovapd `16*6`(%rsp), %xmm12
+ vmovapd `16*7`(%rsp), %xmm13
+ vmovapd `16*8`(%rsp), %xmm14
+ vmovapd `16*9`(%rsp), %xmm15
+ lea 168(%rsp), %rsp
+ pop %rdi
+ pop %rsi
+___
$code.=<<___;
ret
.cfi_endproc
diff --git a/crypto/bn/asm/rsaz-3k-avxifma.pl b/crypto/bn/asm/rsaz-3k-avxifma.pl
index a3bc70c601..91237a0586 100644
--- a/crypto/bn/asm/rsaz-3k-avxifma.pl
+++ b/crypto/bn/asm/rsaz-3k-avxifma.pl
@@ -855,6 +855,23 @@ $code.=<<___;
ossl_extract_multiplier_2x30_win5_avx:
.cfi_startproc
endbranch
+___
+$code.=<<___ if ($win64);
+ push %rsi # save non-volatile registers
+ push %rdi
+ lea -168(%rsp), %rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+ vmovapd %xmm6, `16*0`(%rsp)
+ vmovapd %xmm7, `16*1`(%rsp)
+ vmovapd %xmm8, `16*2`(%rsp)
+ vmovapd %xmm9, `16*3`(%rsp)
+ vmovapd %xmm10, `16*4`(%rsp)
+ vmovapd %xmm11, `16*5`(%rsp)
+ vmovapd %xmm12, `16*6`(%rsp)
+ vmovapd %xmm13, `16*7`(%rsp)
+ vmovapd %xmm14, `16*8`(%rsp)
+ vmovapd %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
vmovapd .Lones(%rip), $ones # broadcast ones
vmovq $red_tbl_idx1, $tmp_xmm
vpbroadcastq $tmp_xmm, $idx1
@@ -928,6 +945,24 @@ foreach (8..15) {
$code.="vmovdqu $t[$_], `${_}*32`($out) \n";
}
+$code.=<<___;
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ vmovapd `16*0`(%rsp), %xmm6
+ vmovapd `16*1`(%rsp), %xmm7
+ vmovapd `16*2`(%rsp), %xmm8
+ vmovapd `16*3`(%rsp), %xmm9
+ vmovapd `16*4`(%rsp), %xmm10
+ vmovapd `16*5`(%rsp), %xmm11
+ vmovapd `16*6`(%rsp), %xmm12
+ vmovapd `16*7`(%rsp), %xmm13
+ vmovapd `16*8`(%rsp), %xmm14
+ vmovapd `16*9`(%rsp), %xmm15
+ lea 168(%rsp), %rsp
+ pop %rdi
+ pop %rsi
+___
$code.=<<___;
diff --git a/crypto/bn/asm/rsaz-4k-avxifma.pl b/crypto/bn/asm/rsaz-4k-avxifma.pl
index d5ff62db0a..9afbb3b7e2 100644
--- a/crypto/bn/asm/rsaz-4k-avxifma.pl
+++ b/crypto/bn/asm/rsaz-4k-avxifma.pl
@@ -973,6 +973,23 @@ $code.=<<___;
ossl_extract_multiplier_2x40_win5_avx:
.cfi_startproc
endbranch
+___
+$code.=<<___ if ($win64);
+ push %rsi # save non-volatile registers
+ push %rdi
+ lea -168(%rsp), %rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+ vmovapd %xmm6, `16*0`(%rsp)
+ vmovapd %xmm7, `16*1`(%rsp)
+ vmovapd %xmm8, `16*2`(%rsp)
+ vmovapd %xmm9, `16*3`(%rsp)
+ vmovapd %xmm10, `16*4`(%rsp)
+ vmovapd %xmm11, `16*5`(%rsp)
+ vmovapd %xmm12, `16*6`(%rsp)
+ vmovapd %xmm13, `16*7`(%rsp)
+ vmovapd %xmm14, `16*8`(%rsp)
+ vmovapd %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
vmovapd .Lones(%rip), $ones # broadcast ones
vmovq $red_tbl_idx1, $tmp_xmm
vpbroadcastq $tmp_xmm, $idx1
@@ -999,6 +1016,24 @@ $code.="movq %r10, $red_tbl \n";
foreach (0..9) {
$code.="vmovdqu $t[$_], `(10+$_)*32`($out) \n";
}
+$code.=<<___;
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ vmovapd `16*0`(%rsp), %xmm6
+ vmovapd `16*1`(%rsp), %xmm7
+ vmovapd `16*2`(%rsp), %xmm8
+ vmovapd `16*3`(%rsp), %xmm9
+ vmovapd `16*4`(%rsp), %xmm10
+ vmovapd `16*5`(%rsp), %xmm11
+ vmovapd `16*6`(%rsp), %xmm12
+ vmovapd `16*7`(%rsp), %xmm13
+ vmovapd `16*8`(%rsp), %xmm14
+ vmovapd `16*9`(%rsp), %xmm15
+ lea 168(%rsp), %rsp
+ pop %rdi
+ pop %rsi
+___
$code.=<<___;
ret