Commit ee9e0f6d8f for openssl.org

commit ee9e0f6d8f32a46d62feaa571f1be48a003ca2bc
Author: Zhiguo Zhou <zhiguo.zhou@intel.com>
Date:   Fri Mar 6 14:39:08 2026 +0800

    bn: Save/restore non-volatile registers in RSAZ AVX-IFMA code for Win64

    The Windows x64 calling convention requires that registers %rsi, %rdi,
    and %xmm6 through %xmm15 be preserved by the callee. This patch updates
    the RSAZ-2K, 3K, and 4K AVX-IFMA assembly routines to correctly push/pop
    %rsi/%rdi and save/restore the non-volatile XMM registers to the stack
    when building for Win64.

    This ensures ABI compliance and prevents potential data corruption or
    crashes in callers that rely on these registers being preserved across
    function calls.

    Functions updated:
    - ossl_rsaz_amm52x20_x1_avxifma256
    - ossl_rsaz_amm52x20_x2_avxifma256
    - ossl_extract_multiplier_2x20_win5_avx
    - ossl_extract_multiplier_2x30_win5_avx
    - ossl_extract_multiplier_2x40_win5_avx

    Reviewed-by: Saša Nedvědický <sashan@openssl.org>
    Reviewed-by: Tomas Mraz <tomas@openssl.org>
    MergeDate: Fri Mar 13 12:15:10 2026
    (Merged from https://github.com/openssl/openssl/pull/30280)

diff --git a/crypto/bn/asm/rsaz-2k-avxifma.pl b/crypto/bn/asm/rsaz-2k-avxifma.pl
index ea45d2051a..1c020842a7 100644
--- a/crypto/bn/asm/rsaz-2k-avxifma.pl
+++ b/crypto/bn/asm/rsaz-2k-avxifma.pl
@@ -362,6 +362,23 @@ ossl_rsaz_amm52x20_x1_avxifma256:
 .cfi_push   %r14
     push    %r15
 .cfi_push   %r15
+___
+$code.=<<___ if ($win64);
+    push      %rsi                          # save non-volatile registers
+    push      %rdi
+    lea       -168(%rsp), %rsp              # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+    vmovapd   %xmm6, `16*0`(%rsp)
+    vmovapd   %xmm7, `16*1`(%rsp)
+    vmovapd   %xmm8, `16*2`(%rsp)
+    vmovapd   %xmm9, `16*3`(%rsp)
+    vmovapd   %xmm10, `16*4`(%rsp)
+    vmovapd   %xmm11, `16*5`(%rsp)
+    vmovapd   %xmm12, `16*6`(%rsp)
+    vmovapd   %xmm13, `16*7`(%rsp)
+    vmovapd   %xmm14, `16*8`(%rsp)
+    vmovapd   %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
 .Lossl_rsaz_amm52x20_x1_avxifma256_body:

     # Zeroing accumulators
@@ -401,6 +418,23 @@ $code.=<<___;
     vmovdqu   $R2_0,  `4*32`($res)

     vzeroupper
+___
+$code.=<<___ if ($win64);
+    vmovapd `16*0`(%rsp), %xmm6
+    vmovapd `16*1`(%rsp), %xmm7
+    vmovapd `16*2`(%rsp), %xmm8
+    vmovapd `16*3`(%rsp), %xmm9
+    vmovapd `16*4`(%rsp), %xmm10
+    vmovapd `16*5`(%rsp), %xmm11
+    vmovapd `16*6`(%rsp), %xmm12
+    vmovapd `16*7`(%rsp), %xmm13
+    vmovapd `16*8`(%rsp), %xmm14
+    vmovapd `16*9`(%rsp), %xmm15
+    lea     168(%rsp), %rsp
+    pop     %rdi
+    pop     %rsi
+___
+$code.=<<___;
     mov  0(%rsp),%r15
 .cfi_restore    %r15
     mov  8(%rsp),%r14
@@ -553,6 +587,23 @@ ossl_rsaz_amm52x20_x2_avxifma256:
 .cfi_push   %r14
     push    %r15
 .cfi_push   %r15
+___
+$code.=<<___ if ($win64);
+    push    %rsi                            # save non-volatile registers
+    push    %rdi
+    lea     -168(%rsp), %rsp                # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+    vmovapd %xmm6, `16*0`(%rsp)
+    vmovapd %xmm7, `16*1`(%rsp)
+    vmovapd %xmm8, `16*2`(%rsp)
+    vmovapd %xmm9, `16*3`(%rsp)
+    vmovapd %xmm10, `16*4`(%rsp)
+    vmovapd %xmm11, `16*5`(%rsp)
+    vmovapd %xmm12, `16*6`(%rsp)
+    vmovapd %xmm13, `16*7`(%rsp)
+    vmovapd %xmm14, `16*8`(%rsp)
+    vmovapd %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
 .Lossl_rsaz_amm52x20_x2_avxifma256_body:

     # Zeroing accumulators
@@ -604,6 +655,23 @@ $code.=<<___;
     vmovdqu   $R2_1,  `9*32`($res)

     vzeroupper
+___
+$code.=<<___ if ($win64);
+    vmovapd `16*0`(%rsp), %xmm6
+    vmovapd `16*1`(%rsp), %xmm7
+    vmovapd `16*2`(%rsp), %xmm8
+    vmovapd `16*3`(%rsp), %xmm9
+    vmovapd `16*4`(%rsp), %xmm10
+    vmovapd `16*5`(%rsp), %xmm11
+    vmovapd `16*6`(%rsp), %xmm12
+    vmovapd `16*7`(%rsp), %xmm13
+    vmovapd `16*8`(%rsp), %xmm14
+    vmovapd `16*9`(%rsp), %xmm15
+    lea     168(%rsp), %rsp
+    pop     %rdi
+    pop     %rsi
+___
+$code.=<<___;
     mov  0(%rsp),%r15
 .cfi_restore    %r15
     mov  8(%rsp),%r14
@@ -663,6 +731,23 @@ $code.=<<___;
 ossl_extract_multiplier_2x20_win5_avx:
 .cfi_startproc
     endbranch
+___
+$code.=<<___ if ($win64);
+    push      %rsi                          # save non-volatile registers
+    push      %rdi
+    lea       -168(%rsp), %rsp              # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+    vmovapd   %xmm6, `16*0`(%rsp)
+    vmovapd   %xmm7, `16*1`(%rsp)
+    vmovapd   %xmm8, `16*2`(%rsp)
+    vmovapd   %xmm9, `16*3`(%rsp)
+    vmovapd   %xmm10, `16*4`(%rsp)
+    vmovapd   %xmm11, `16*5`(%rsp)
+    vmovapd   %xmm12, `16*6`(%rsp)
+    vmovapd   %xmm13, `16*7`(%rsp)
+    vmovapd   %xmm14, `16*8`(%rsp)
+    vmovapd   %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
     vmovapd   .Lones(%rip), $ones         # broadcast ones
     vmovq $red_tbl_idx1, $tmp_xmm
     vpbroadcastq    $tmp_xmm, $idx1
@@ -708,6 +793,24 @@ ___
 foreach (0..9) {
     $code.="vmovdqu   $t[$_], `${_}*32`($out) \n";
 }
+$code.=<<___;
+    vzeroupper
+___
+$code.=<<___ if ($win64);
+    vmovapd `16*0`(%rsp), %xmm6
+    vmovapd `16*1`(%rsp), %xmm7
+    vmovapd `16*2`(%rsp), %xmm8
+    vmovapd `16*3`(%rsp), %xmm9
+    vmovapd `16*4`(%rsp), %xmm10
+    vmovapd `16*5`(%rsp), %xmm11
+    vmovapd `16*6`(%rsp), %xmm12
+    vmovapd `16*7`(%rsp), %xmm13
+    vmovapd `16*8`(%rsp), %xmm14
+    vmovapd `16*9`(%rsp), %xmm15
+    lea     168(%rsp), %rsp
+    pop     %rdi
+    pop     %rsi
+___
 $code.=<<___;
     ret
 .cfi_endproc
diff --git a/crypto/bn/asm/rsaz-3k-avxifma.pl b/crypto/bn/asm/rsaz-3k-avxifma.pl
index a3bc70c601..91237a0586 100644
--- a/crypto/bn/asm/rsaz-3k-avxifma.pl
+++ b/crypto/bn/asm/rsaz-3k-avxifma.pl
@@ -855,6 +855,23 @@ $code.=<<___;
 ossl_extract_multiplier_2x30_win5_avx:
 .cfi_startproc
     endbranch
+___
+$code.=<<___ if ($win64);
+    push      %rsi                          # save non-volatile registers
+    push      %rdi
+    lea       -168(%rsp), %rsp              # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+    vmovapd   %xmm6, `16*0`(%rsp)
+    vmovapd   %xmm7, `16*1`(%rsp)
+    vmovapd   %xmm8, `16*2`(%rsp)
+    vmovapd   %xmm9, `16*3`(%rsp)
+    vmovapd   %xmm10, `16*4`(%rsp)
+    vmovapd   %xmm11, `16*5`(%rsp)
+    vmovapd   %xmm12, `16*6`(%rsp)
+    vmovapd   %xmm13, `16*7`(%rsp)
+    vmovapd   %xmm14, `16*8`(%rsp)
+    vmovapd   %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
     vmovapd   .Lones(%rip), $ones         # broadcast ones
     vmovq    $red_tbl_idx1, $tmp_xmm
     vpbroadcastq    $tmp_xmm, $idx1
@@ -928,6 +945,24 @@ foreach (8..15) {
     $code.="vmovdqu   $t[$_], `${_}*32`($out) \n";
 }

+$code.=<<___;
+    vzeroupper
+___
+$code.=<<___ if ($win64);
+    vmovapd `16*0`(%rsp), %xmm6
+    vmovapd `16*1`(%rsp), %xmm7
+    vmovapd `16*2`(%rsp), %xmm8
+    vmovapd `16*3`(%rsp), %xmm9
+    vmovapd `16*4`(%rsp), %xmm10
+    vmovapd `16*5`(%rsp), %xmm11
+    vmovapd `16*6`(%rsp), %xmm12
+    vmovapd `16*7`(%rsp), %xmm13
+    vmovapd `16*8`(%rsp), %xmm14
+    vmovapd `16*9`(%rsp), %xmm15
+    lea     168(%rsp), %rsp
+    pop     %rdi
+    pop     %rsi
+___

 $code.=<<___;

diff --git a/crypto/bn/asm/rsaz-4k-avxifma.pl b/crypto/bn/asm/rsaz-4k-avxifma.pl
index d5ff62db0a..9afbb3b7e2 100644
--- a/crypto/bn/asm/rsaz-4k-avxifma.pl
+++ b/crypto/bn/asm/rsaz-4k-avxifma.pl
@@ -973,6 +973,23 @@ $code.=<<___;
 ossl_extract_multiplier_2x40_win5_avx:
 .cfi_startproc
     endbranch
+___
+$code.=<<___ if ($win64);
+    push      %rsi                          # save non-volatile registers
+    push      %rdi
+    lea       -168(%rsp), %rsp              # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
+    vmovapd   %xmm6, `16*0`(%rsp)
+    vmovapd   %xmm7, `16*1`(%rsp)
+    vmovapd   %xmm8, `16*2`(%rsp)
+    vmovapd   %xmm9, `16*3`(%rsp)
+    vmovapd   %xmm10, `16*4`(%rsp)
+    vmovapd   %xmm11, `16*5`(%rsp)
+    vmovapd   %xmm12, `16*6`(%rsp)
+    vmovapd   %xmm13, `16*7`(%rsp)
+    vmovapd   %xmm14, `16*8`(%rsp)
+    vmovapd   %xmm15, `16*9`(%rsp)
+___
+$code.=<<___;
     vmovapd   .Lones(%rip), $ones         # broadcast ones
     vmovq $red_tbl_idx1, $tmp_xmm
     vpbroadcastq    $tmp_xmm, $idx1
@@ -999,6 +1016,24 @@ $code.="movq    %r10, $red_tbl \n";
 foreach (0..9) {
     $code.="vmovdqu   $t[$_], `(10+$_)*32`($out) \n";
 }
+$code.=<<___;
+    vzeroupper
+___
+$code.=<<___ if ($win64);
+    vmovapd `16*0`(%rsp), %xmm6
+    vmovapd `16*1`(%rsp), %xmm7
+    vmovapd `16*2`(%rsp), %xmm8
+    vmovapd `16*3`(%rsp), %xmm9
+    vmovapd `16*4`(%rsp), %xmm10
+    vmovapd `16*5`(%rsp), %xmm11
+    vmovapd `16*6`(%rsp), %xmm12
+    vmovapd `16*7`(%rsp), %xmm13
+    vmovapd `16*8`(%rsp), %xmm14
+    vmovapd `16*9`(%rsp), %xmm15
+    lea     168(%rsp), %rsp
+    pop     %rdi
+    pop     %rsi
+___
 $code.=<<___;

     ret