Commit ba92905c74 for openssl.org
commit ba92905c745fd459dfafc68bfc6d29d37b466f52
Author: Marcel Cornu <marcel.d.cornu@intel.com>
Date: Wed Mar 4 21:18:19 2026 +0000
ML-DSA: Preserve non-volatile XMM registers on Windows
AVX2 NTT functions clobber YMM6-YMM15. This commit preserves
XMM6-XMM15 (the lower halves of YMM6-YMM15), as required by the Windows x64 ABI.
Reviewed-by: Saša Nedvědický <sashan@openssl.org>
Reviewed-by: Paul Dale <paul.dale@oracle.com>
Reviewed-by: Neil Horman <nhorman@openssl.org>
MergeDate: Wed Mar 11 15:47:49 2026
(Merged from https://github.com/openssl/openssl/pull/30160)
diff --git a/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl b/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
index 6012b34bb7..97a64db381 100644
--- a/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
+++ b/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
@@ -1592,6 +1592,16 @@ ml_dsa_inverse_degree_montgomery:
.align 32
ml_dsa_poly_ntt_mult_avx2:
.cfi_startproc
+___
+$code .= <<___ if ($win64);
+ sub \$88, %rsp
+ vmovaps %xmm8, 0(%rsp)
+ vmovaps %xmm9, 16(%rsp)
+ vmovaps %xmm10, 32(%rsp)
+ vmovaps %xmm14, 48(%rsp)
+ vmovaps %xmm15, 64(%rsp)
+___
+$code .= <<___;
vpbroadcastq ml_dsa_q_neg_inv(%rip), %ymm14
vpbroadcastd ml_dsa_q(%rip), %ymm15
xor %r10d, %r10d
@@ -1618,7 +1628,18 @@ $code .= <<___;
cmp \$256*4, %r10d
jb .Lmult_loop
+ # clear and restore registers
vzeroall
+___
+$code .= <<___ if ($win64);
+ vmovaps 0(%rsp), %xmm8
+ vmovaps 16(%rsp), %xmm9
+ vmovaps 32(%rsp), %xmm10
+ vmovaps 48(%rsp), %xmm14
+ vmovaps 64(%rsp), %xmm15
+ add \$88, %rsp
+___
+$code .= <<___;
ret
.cfi_endproc
.size ml_dsa_poly_ntt_mult_avx2, .-ml_dsa_poly_ntt_mult_avx2
@@ -1657,6 +1678,21 @@ $code .= <<___;
.align 32
ml_dsa_poly_ntt_avx2:
.cfi_startproc
+___
+$code .= <<___ if ($win64);
+ sub \$168, %rsp
+ vmovaps %xmm6, 0(%rsp)
+ vmovaps %xmm7, 16(%rsp)
+ vmovaps %xmm8, 32(%rsp)
+ vmovaps %xmm9, 48(%rsp)
+ vmovaps %xmm10, 64(%rsp)
+ vmovaps %xmm11, 80(%rsp)
+ vmovaps %xmm12, 96(%rsp)
+ vmovaps %xmm13, 112(%rsp)
+ vmovaps %xmm14, 128(%rsp)
+ vmovaps %xmm15, 144(%rsp)
+___
+$code .= <<___;
# move p_zetas to r11
mov %rsi, %r11
@@ -1701,7 +1737,23 @@ ___
$code .= <<___;
+ # clear and restore registers
vzeroall
+___
+$code .= <<___ if ($win64);
+ vmovaps 0(%rsp), %xmm6
+ vmovaps 16(%rsp), %xmm7
+ vmovaps 32(%rsp), %xmm8
+ vmovaps 48(%rsp), %xmm9
+ vmovaps 64(%rsp), %xmm10
+ vmovaps 80(%rsp), %xmm11
+ vmovaps 96(%rsp), %xmm12
+ vmovaps 112(%rsp), %xmm13
+ vmovaps 128(%rsp), %xmm14
+ vmovaps 144(%rsp), %xmm15
+ add \$168, %rsp
+___
+$code .= <<___;
ret
.cfi_endproc
.size ml_dsa_poly_ntt_avx2, .-ml_dsa_poly_ntt_avx2
@@ -1738,6 +1790,21 @@ $code .= <<___;
.align 32
ml_dsa_poly_ntt_inverse_avx2:
.cfi_startproc
+___
+$code .= <<___ if ($win64);
+ sub \$168, %rsp
+ vmovaps %xmm6, 0(%rsp)
+ vmovaps %xmm7, 16(%rsp)
+ vmovaps %xmm8, 32(%rsp)
+ vmovaps %xmm9, 48(%rsp)
+ vmovaps %xmm10, 64(%rsp)
+ vmovaps %xmm11, 80(%rsp)
+ vmovaps %xmm12, 96(%rsp)
+ vmovaps %xmm13, 112(%rsp)
+ vmovaps %xmm14, 128(%rsp)
+ vmovaps %xmm15, 144(%rsp)
+___
+$code .= <<___;
lea zetas_inverse(%rip), %r11
vpbroadcastq ml_dsa_q_neg_inv(%rip), %ymm14
@@ -1787,7 +1854,23 @@ ___
&intt_levels5to7(24*4);
$code .= <<___;
+ # clear and restore registers
vzeroall
+___
+$code .= <<___ if ($win64);
+ vmovaps 0(%rsp), %xmm6
+ vmovaps 16(%rsp), %xmm7
+ vmovaps 32(%rsp), %xmm8
+ vmovaps 48(%rsp), %xmm9
+ vmovaps 64(%rsp), %xmm10
+ vmovaps 80(%rsp), %xmm11
+ vmovaps 96(%rsp), %xmm12
+ vmovaps 112(%rsp), %xmm13
+ vmovaps 128(%rsp), %xmm14
+ vmovaps 144(%rsp), %xmm15
+ add \$168, %rsp
+___
+$code .= <<___;
ret
.cfi_endproc
.size ml_dsa_poly_ntt_inverse_avx2, .-ml_dsa_poly_ntt_inverse_avx2