Commit fc585a277b for openssl.org
commit fc585a277b17cff3ab8f43626869d4629e1f71f5
Author: Marcel Cornu <marcel.d.cornu@intel.com>
Date: Thu Mar 5 14:15:55 2026 +0000
ML-DSA: Add Win64 SEH unwind support to AVX2 NTT functions
Add ntt_se_handler and .pdata/.xdata sections for the three AVX2 NTT
functions.
Signed-off-by: Marcel Cornu <marcel.d.cornu@intel.com>
Reviewed-by: Saša Nedvědický <sashan@openssl.org>
Reviewed-by: Paul Dale <paul.dale@oracle.com>
Reviewed-by: Neil Horman <nhorman@openssl.org>
MergeDate: Wed Mar 11 15:47:52 2026
(Merged from https://github.com/openssl/openssl/pull/30160)
diff --git a/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl b/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
index 97a64db381..9683930760 100644
--- a/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
+++ b/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
@@ -1594,14 +1594,20 @@ ml_dsa_poly_ntt_mult_avx2:
.cfi_startproc
___
$code .= <<___ if ($win64);
- sub \$88, %rsp
- vmovaps %xmm8, 0(%rsp)
- vmovaps %xmm9, 16(%rsp)
- vmovaps %xmm10, 32(%rsp)
- vmovaps %xmm14, 48(%rsp)
- vmovaps %xmm15, 64(%rsp)
+ lea -168(%rax), %rsp
+ vmovaps %xmm6, 0(%rsp)
+ vmovaps %xmm7, 16(%rsp)
+ vmovaps %xmm8, 32(%rsp)
+ vmovaps %xmm9, 48(%rsp)
+ vmovaps %xmm10, 64(%rsp)
+ vmovaps %xmm11, 80(%rsp)
+ vmovaps %xmm12, 96(%rsp)
+ vmovaps %xmm13, 112(%rsp)
+ vmovaps %xmm14, 128(%rsp)
+ vmovaps %xmm15, 144(%rsp)
___
$code .= <<___;
+.Lntt_mult_body:
vpbroadcastq ml_dsa_q_neg_inv(%rip), %ymm14
vpbroadcastd ml_dsa_q(%rip), %ymm15
xor %r10d, %r10d
@@ -1632,14 +1638,20 @@ $code .= <<___;
vzeroall
___
$code .= <<___ if ($win64);
- vmovaps 0(%rsp), %xmm8
- vmovaps 16(%rsp), %xmm9
- vmovaps 32(%rsp), %xmm10
- vmovaps 48(%rsp), %xmm14
- vmovaps 64(%rsp), %xmm15
- add \$88, %rsp
+ vmovaps 0(%rsp), %xmm6
+ vmovaps 16(%rsp), %xmm7
+ vmovaps 32(%rsp), %xmm8
+ vmovaps 48(%rsp), %xmm9
+ vmovaps 64(%rsp), %xmm10
+ vmovaps 80(%rsp), %xmm11
+ vmovaps 96(%rsp), %xmm12
+ vmovaps 112(%rsp), %xmm13
+ vmovaps 128(%rsp), %xmm14
+ vmovaps 144(%rsp), %xmm15
+ lea (%rax), %rsp
___
$code .= <<___;
+.Lntt_mult_epilogue:
ret
.cfi_endproc
.size ml_dsa_poly_ntt_mult_avx2, .-ml_dsa_poly_ntt_mult_avx2
@@ -1680,7 +1692,7 @@ ml_dsa_poly_ntt_avx2:
.cfi_startproc
___
$code .= <<___ if ($win64);
- sub \$168, %rsp
+ lea -168(%rax), %rsp
vmovaps %xmm6, 0(%rsp)
vmovaps %xmm7, 16(%rsp)
vmovaps %xmm8, 32(%rsp)
@@ -1693,6 +1705,7 @@ $code .= <<___ if ($win64);
vmovaps %xmm15, 144(%rsp)
___
$code .= <<___;
+.Lntt_body:
# move p_zetas to r11
mov %rsi, %r11
@@ -1751,9 +1764,10 @@ $code .= <<___ if ($win64);
vmovaps 112(%rsp), %xmm13
vmovaps 128(%rsp), %xmm14
vmovaps 144(%rsp), %xmm15
- add \$168, %rsp
+ lea (%rax), %rsp
___
$code .= <<___;
+.Lntt_epilogue:
ret
.cfi_endproc
.size ml_dsa_poly_ntt_avx2, .-ml_dsa_poly_ntt_avx2
@@ -1792,7 +1806,7 @@ ml_dsa_poly_ntt_inverse_avx2:
.cfi_startproc
___
$code .= <<___ if ($win64);
- sub \$168, %rsp
+ lea -168(%rax), %rsp
vmovaps %xmm6, 0(%rsp)
vmovaps %xmm7, 16(%rsp)
vmovaps %xmm8, 32(%rsp)
@@ -1805,6 +1819,7 @@ $code .= <<___ if ($win64);
vmovaps %xmm15, 144(%rsp)
___
$code .= <<___;
+.Lintt_body:
lea zetas_inverse(%rip), %r11
vpbroadcastq ml_dsa_q_neg_inv(%rip), %ymm14
@@ -1868,14 +1883,129 @@ $code .= <<___ if ($win64);
vmovaps 112(%rsp), %xmm13
vmovaps 128(%rsp), %xmm14
vmovaps 144(%rsp), %xmm15
- add \$168, %rsp
+ lea (%rax), %rsp
___
$code .= <<___;
+.Lintt_epilogue:
ret
.cfi_endproc
.size ml_dsa_poly_ntt_inverse_avx2, .-ml_dsa_poly_ntt_inverse_avx2
___
+# Windows SEH exception handler and unwind data
+if ($win64) {
+my $context = "%r8";
+my $disp = "%r9";
+
+$code .= <<___;
+.extern __imp_RtlVirtualUnwind
+.type ntt_se_handler,\@abi-omnipotent
+.align 16
+ntt_se_handler:
+ push %rsi # save registers; they are popped in reverse order before ret
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64, %rsp # room for the RtlVirtualUnwind call; stack args go at 32..56(%rsp)
+
+ mov 120($context), %rax # context->Rax = original %rsp (saved by xlate preamble)
+ mov 248($context), %rbx # context->Rip
+
+ mov 8($disp), %rsi # disp->ImageBase
+ mov 56($disp), %r11 # disp->HandlerData
+
+ mov 0(%r11), %r10d # HandlerData[0]: body label (rva)
+ lea (%rsi,%r10), %r10 # body label address = ImageBase + rva
+ cmp %r10, %rbx # Rip < body?
+ jb .Lntt_in_prologue # yes: skip the XMM copy
+
+ mov 4(%r11), %r10d # HandlerData[1]: epilogue label (rva)
+ lea (%rsi,%r10), %r10 # epilogue label address = ImageBase + rva
+ cmp %r10, %rbx # Rip >= epilogue?
+ jae .Lntt_in_prologue # yes: skip the XMM copy
+
+ # In function body: XMM6-XMM15 are saved at 0..144(new_rsp).
+ # context->Rsp = new_rsp = rax - 168
+ mov 152($context), %rsi # context->Rsp = new_rsp (address of XMM saves)
+ lea 512($context), %rdi # &context->Xmm6
+ mov \$20, %ecx # 10 XMMs * 2 qwords = 20 qwords
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lntt_in_prologue:
+ # Common tail (also reached by fall-through from the body case): restore rdi and rsi saved by xlate preamble in shadow space
+ mov 8(%rax), %rcx
+ mov 16(%rax), %rdx
+ mov %rcx, 176($context) # context->Rdi
+ mov %rdx, 168($context) # context->Rsi
+ mov %rax, 152($context) # context->Rsp = original %rsp
+
+ mov 40($disp), %rdi # disp->ContextRecord
+ mov $context, %rsi # source: the context updated above
+ mov \$154, %ecx # sizeof(CONTEXT)/8
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp, %rsi # set up the RtlVirtualUnwind arguments from disp
+ xor %rcx, %rcx # arg1: UNW_FLAG_NHANDLER
+ mov 8(%rsi), %rdx # arg2: disp->ImageBase
+ mov 0(%rsi), %r8 # arg3: disp->ControlPc
+ mov 16(%rsi), %r9 # arg4: disp->FunctionEntry
+ mov 40(%rsi), %r10 # disp->ContextRecord
+ lea 56(%rsi), %r11 # &disp->HandlerData
+ lea 24(%rsi), %r12 # &disp->EstablisherFrame
+ mov %r10, 32(%rsp) # arg5: ContextRecord
+ mov %r11, 40(%rsp) # arg6: &HandlerData
+ mov %r12, 48(%rsp) # arg7: &EstablisherFrame
+ mov %rcx, 56(%rsp) # arg8: zero (rcx was cleared above)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1, %eax # ExceptionContinueSearch
+ add \$64, %rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size ntt_se_handler,.-ntt_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_ml_dsa_poly_ntt_mult_avx2 # function start (rva)
+ .rva .LSEH_end_ml_dsa_poly_ntt_mult_avx2 # function end (rva)
+ .rva .LSEH_info_ml_dsa_poly_ntt_mult_avx2 # unwind info (rva)
+ .rva .LSEH_begin_ml_dsa_poly_ntt_avx2
+ .rva .LSEH_end_ml_dsa_poly_ntt_avx2
+ .rva .LSEH_info_ml_dsa_poly_ntt_avx2
+ .rva .LSEH_begin_ml_dsa_poly_ntt_inverse_avx2
+ .rva .LSEH_end_ml_dsa_poly_ntt_inverse_avx2
+ .rva .LSEH_info_ml_dsa_poly_ntt_inverse_avx2
+
+.section .xdata
+.align 8
+.LSEH_info_ml_dsa_poly_ntt_mult_avx2:
+ .byte 9,0,0,0 # UNWIND_INFO header: version 1 with UNW_FLAG_EHANDLER, no prologue unwind codes
+ .rva ntt_se_handler # language-specific handler
+ .rva .Lntt_mult_body,.Lntt_mult_epilogue # HandlerData[0..1]: body and epilogue labels read by ntt_se_handler
+.LSEH_info_ml_dsa_poly_ntt_avx2:
+ .byte 9,0,0,0 # same header as above
+ .rva ntt_se_handler
+ .rva .Lntt_body,.Lntt_epilogue # HandlerData[0..1]
+.LSEH_info_ml_dsa_poly_ntt_inverse_avx2:
+ .byte 9,0,0,0 # same header as above
+ .rva ntt_se_handler
+ .rva .Lintt_body,.Lintt_epilogue # HandlerData[0..1]
+___
+}
+
}}} else {{{
# When AVX2 is not available, output stub functions
# The capable function returns 0, and the operation functions trap if called