Commit fc585a277b for openssl.org

commit fc585a277b17cff3ab8f43626869d4629e1f71f5
Author: Marcel Cornu <marcel.d.cornu@intel.com>
Date:   Thu Mar 5 14:15:55 2026 +0000

    ML-DSA: Add Win64 SEH unwind support to AVX2 NTT functions

    Add ntt_se_handler and .pdata/.xdata sections for the three AVX2 NTT
    functions.

    Signed-off-by: Marcel Cornu <marcel.d.cornu@intel.com>

    Reviewed-by: Saša NedvÄ›dický <sashan@openssl.org>
    Reviewed-by: Paul Dale <paul.dale@oracle.com>
    Reviewed-by: Neil Horman <nhorman@openssl.org>
    MergeDate: Wed Mar 11 15:47:52 2026
    (Merged from https://github.com/openssl/openssl/pull/30160)

diff --git a/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl b/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
index 97a64db381..9683930760 100644
--- a/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
+++ b/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
@@ -1594,14 +1594,20 @@ ml_dsa_poly_ntt_mult_avx2:
 .cfi_startproc
 ___
 $code .= <<___ if ($win64);
-    sub     \$88, %rsp
-    vmovaps %xmm8,  0(%rsp)
-    vmovaps %xmm9,  16(%rsp)
-    vmovaps %xmm10, 32(%rsp)
-    vmovaps %xmm14, 48(%rsp)
-    vmovaps %xmm15, 64(%rsp)
+    lea     -168(%rax), %rsp
+    vmovaps %xmm6,   0(%rsp)
+    vmovaps %xmm7,   16(%rsp)
+    vmovaps %xmm8,   32(%rsp)
+    vmovaps %xmm9,   48(%rsp)
+    vmovaps %xmm10,  64(%rsp)
+    vmovaps %xmm11,  80(%rsp)
+    vmovaps %xmm12,  96(%rsp)
+    vmovaps %xmm13,  112(%rsp)
+    vmovaps %xmm14,  128(%rsp)
+    vmovaps %xmm15,  144(%rsp)
 ___
 $code .= <<___;
+.Lntt_mult_body:
     vpbroadcastq ml_dsa_q_neg_inv(%rip), %ymm14
     vpbroadcastd ml_dsa_q(%rip), %ymm15
     xor %r10d, %r10d
@@ -1632,14 +1638,20 @@ $code .= <<___;
     vzeroall
 ___
 $code .= <<___ if ($win64);
-    vmovaps 0(%rsp),  %xmm8
-    vmovaps 16(%rsp), %xmm9
-    vmovaps 32(%rsp), %xmm10
-    vmovaps 48(%rsp), %xmm14
-    vmovaps 64(%rsp), %xmm15
-    add     \$88, %rsp
+    vmovaps 0(%rsp),   %xmm6
+    vmovaps 16(%rsp),  %xmm7
+    vmovaps 32(%rsp),  %xmm8
+    vmovaps 48(%rsp),  %xmm9
+    vmovaps 64(%rsp),  %xmm10
+    vmovaps 80(%rsp),  %xmm11
+    vmovaps 96(%rsp),  %xmm12
+    vmovaps 112(%rsp), %xmm13
+    vmovaps 128(%rsp), %xmm14
+    vmovaps 144(%rsp), %xmm15
+    lea     (%rax), %rsp
 ___
 $code .= <<___;
+.Lntt_mult_epilogue:
     ret
 .cfi_endproc
 .size   ml_dsa_poly_ntt_mult_avx2, .-ml_dsa_poly_ntt_mult_avx2
@@ -1680,7 +1692,7 @@ ml_dsa_poly_ntt_avx2:
 .cfi_startproc
 ___
 $code .= <<___ if ($win64);
-    sub     \$168, %rsp
+    lea     -168(%rax), %rsp
     vmovaps %xmm6,   0(%rsp)
     vmovaps %xmm7,   16(%rsp)
     vmovaps %xmm8,   32(%rsp)
@@ -1693,6 +1705,7 @@ $code .= <<___ if ($win64);
     vmovaps %xmm15,  144(%rsp)
 ___
 $code .= <<___;
+.Lntt_body:

     # move p_zetas to r11
     mov %rsi, %r11
@@ -1751,9 +1764,10 @@ $code .= <<___ if ($win64);
     vmovaps 112(%rsp), %xmm13
     vmovaps 128(%rsp), %xmm14
     vmovaps 144(%rsp), %xmm15
-    add     \$168, %rsp
+    lea     (%rax), %rsp
 ___
 $code .= <<___;
+.Lntt_epilogue:
     ret
 .cfi_endproc
 .size   ml_dsa_poly_ntt_avx2, .-ml_dsa_poly_ntt_avx2
@@ -1792,7 +1806,7 @@ ml_dsa_poly_ntt_inverse_avx2:
 .cfi_startproc
 ___
 $code .= <<___ if ($win64);
-    sub     \$168, %rsp
+    lea     -168(%rax), %rsp
     vmovaps %xmm6,   0(%rsp)
     vmovaps %xmm7,   16(%rsp)
     vmovaps %xmm8,   32(%rsp)
@@ -1805,6 +1819,7 @@ $code .= <<___ if ($win64);
     vmovaps %xmm15,  144(%rsp)
 ___
 $code .= <<___;
+.Lintt_body:
     lea zetas_inverse(%rip), %r11

     vpbroadcastq ml_dsa_q_neg_inv(%rip), %ymm14
@@ -1868,14 +1883,129 @@ $code .= <<___ if ($win64);
     vmovaps 112(%rsp), %xmm13
     vmovaps 128(%rsp), %xmm14
     vmovaps 144(%rsp), %xmm15
-    add     \$168, %rsp
+    lea     (%rax), %rsp
 ___
 $code .= <<___;
+.Lintt_epilogue:
     ret
 .cfi_endproc
 .size   ml_dsa_poly_ntt_inverse_avx2, .-ml_dsa_poly_ntt_inverse_avx2
 ___

+# Windows SEH exception handler and unwind data
+if ($win64) {
+my $context = "%r8";
+my $disp    = "%r9";
+
+$code .= <<___;
+.extern __imp_RtlVirtualUnwind
+.type   ntt_se_handler,\@abi-omnipotent
+.align  16
+ntt_se_handler:
+    push    %rsi
+    push    %rdi
+    push    %rbx
+    push    %rbp
+    push    %r12
+    push    %r13
+    push    %r14
+    push    %r15
+    pushfq
+    sub     \$64, %rsp
+
+    mov     120($context), %rax     # context->Rax = original %rsp (saved by xlate preamble)
+    mov     248($context), %rbx     # context->Rip
+
+    mov     8($disp), %rsi          # disp->ImageBase
+    mov     56($disp), %r11         # disp->HandlerData
+
+    mov     0(%r11), %r10d          # HandlerData[0]: body label (rva)
+    lea     (%rsi,%r10), %r10
+    cmp     %r10, %rbx              # Rip < body?
+    jb      .Lntt_in_prologue
+
+    mov     4(%r11), %r10d          # HandlerData[1]: epilogue label (rva)
+    lea     (%rsi,%r10), %r10
+    cmp     %r10, %rbx              # Rip >= epilogue?
+    jae     .Lntt_in_prologue
+
+    # In function body: XMM6-XMM15 are saved at 0..144(new_rsp).
+    # context->Rsp = new_rsp = rax - 168
+    mov     152($context), %rsi     # context->Rsp = new_rsp (address of XMM saves)
+    lea     512($context), %rdi     # &context->Xmm6
+    mov     \$20, %ecx              # 10 XMMs * 2 qwords = 20 qwords
+    .long   0xa548f3fc              # cld; rep movsq
+
+.Lntt_in_prologue:
+    # Restore rdi and rsi saved by xlate preamble in shadow space
+    mov     8(%rax), %rcx
+    mov     16(%rax), %rdx
+    mov     %rcx, 176($context)     # context->Rdi
+    mov     %rdx, 168($context)     # context->Rsi
+    mov     %rax, 152($context)     # context->Rsp = original %rsp
+
+    mov     40($disp), %rdi         # disp->ContextRecord
+    mov     $context, %rsi
+    mov     \$154, %ecx             # sizeof(CONTEXT)/8
+    .long   0xa548f3fc              # cld; rep movsq
+
+    mov     $disp, %rsi
+    xor     %rcx, %rcx              # UNW_FLAG_NHANDLER
+    mov     8(%rsi), %rdx           # disp->ImageBase
+    mov     0(%rsi), %r8            # disp->ControlPc
+    mov     16(%rsi), %r9           # disp->FunctionEntry
+    mov     40(%rsi), %r10          # disp->ContextRecord
+    lea     56(%rsi), %r11          # &disp->HandlerData
+    lea     24(%rsi), %r12          # &disp->EstablisherFrame
+    mov     %r10, 32(%rsp)
+    mov     %r11, 40(%rsp)
+    mov     %r12, 48(%rsp)
+    mov     %rcx, 56(%rsp)
+    call    *__imp_RtlVirtualUnwind(%rip)
+
+    mov     \$1, %eax               # ExceptionContinueSearch
+    add     \$64, %rsp
+    popfq
+    pop     %r15
+    pop     %r14
+    pop     %r13
+    pop     %r12
+    pop     %rbp
+    pop     %rbx
+    pop     %rdi
+    pop     %rsi
+    ret
+.size   ntt_se_handler,.-ntt_se_handler
+
+.section    .pdata
+.align  4
+    .rva    .LSEH_begin_ml_dsa_poly_ntt_mult_avx2
+    .rva    .LSEH_end_ml_dsa_poly_ntt_mult_avx2
+    .rva    .LSEH_info_ml_dsa_poly_ntt_mult_avx2
+    .rva    .LSEH_begin_ml_dsa_poly_ntt_avx2
+    .rva    .LSEH_end_ml_dsa_poly_ntt_avx2
+    .rva    .LSEH_info_ml_dsa_poly_ntt_avx2
+    .rva    .LSEH_begin_ml_dsa_poly_ntt_inverse_avx2
+    .rva    .LSEH_end_ml_dsa_poly_ntt_inverse_avx2
+    .rva    .LSEH_info_ml_dsa_poly_ntt_inverse_avx2
+
+.section    .xdata
+.align  8
+.LSEH_info_ml_dsa_poly_ntt_mult_avx2:
+    .byte   9,0,0,0
+    .rva    ntt_se_handler
+    .rva    .Lntt_mult_body,.Lntt_mult_epilogue
+.LSEH_info_ml_dsa_poly_ntt_avx2:
+    .byte   9,0,0,0
+    .rva    ntt_se_handler
+    .rva    .Lntt_body,.Lntt_epilogue
+.LSEH_info_ml_dsa_poly_ntt_inverse_avx2:
+    .byte   9,0,0,0
+    .rva    ntt_se_handler
+    .rva    .Lintt_body,.Lintt_epilogue
+___
+}
+
 }}} else {{{
 # When AVX2 is not available, output stub functions
 # The capable function returns 0, and the operation functions trap if called