Commit 3c07701bf5 for openssl.org

commit 3c07701bf542ddfcae2a9d5623fd68b33981fb78
Author: Tomasz Kantecki <tomasz.kantecki@intel.com>
Date:   Tue Feb 24 13:39:47 2026 +0000

    Add AVX2 optimized assembly for ML-DSA NTT

    This commit introduces AVX2-vectorized assembly implementations of the
    Number Theoretic Transform (NTT) operations used in ML-DSA (FIPS 204).
    These optimizations improve performance of ML-DSA key generation,
    signing, and verification operations on x86_64 platforms
    with AVX2 support.

    The implementation adds the following functions:
    - ml_dsa_poly_ntt_avx2: Forward NTT transformation
    - ml_dsa_poly_ntt_inverse_avx2: Inverse NTT transformation
    - ml_dsa_poly_ntt_mult_avx2: NTT-domain polynomial multiplication

    Key implementation details:
    - Uses YMM registers to process 8 32-bit coefficients in parallel
    - Employs Montgomery reduction for modular arithmetic
    - Implements NTT butterfly operations across multiple transform levels
    - Includes a dedicated zeta table for the INTT to reduce cycles
    - Runtime capability check via ml_dsa_ntt_avx2_capable() using
      OPENSSL_ia32cap_P to detect AVX2 support

    The C code in ml_dsa_ntt.c is updated to dispatch to the AVX2
    implementations at runtime when available, falling back automatically
    to the portable C implementation on platforms without AVX2 support.

    Build system changes:
    - Added GENERATE rule for ml_dsa_ntt-x86_64.s from Perl assembly
    - Conditional assembly inclusion based on target architecture
    - Works with both libcrypto and FIPS provider builds

    Co-authored-by: Marcel Cornu <marcel.d.cornu@intel.com>

    Reviewed-by: Saša Nedvědický <sashan@openssl.org>
    Reviewed-by: Paul Dale <paul.dale@oracle.com>
    Reviewed-by: Neil Horman <nhorman@openssl.org>
    MergeDate: Wed Mar 11 15:47:40 2026
    (Merged from https://github.com/openssl/openssl/pull/30160)

diff --git a/CHANGES.md b/CHANGES.md
index b6e90cb5f3..0c779e276a 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -31,7 +31,9 @@ OpenSSL Releases

 ### Changes between 4.0 and 4.1 [xx XXX xxxx]

- * none yet
+ * Added AVX2 optimized ML-DSA NTT operations on `x86_64`.
+
+   *Marcel Cornu and Tomasz Kantecki*

 ### Changes between 3.6 and 4.0 [xx XXX xxxx]

diff --git a/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl b/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
new file mode 100644
index 0000000000..c412e5e6c2
--- /dev/null
+++ b/crypto/ml_dsa/asm/ml_dsa_ntt-x86_64.pl
@@ -0,0 +1,1807 @@
+#! /usr/bin/env perl
+#
+# Copyright 2026 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright (c) 2026 Intel Corporation. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+###############################################################################
+# ML-DSA AVX2 Vectorized NTT/INTT Assembly Routines
+#
+# Description:
+#   This file provides optimized x86_64 assembly implementations of the Number
+#   Theoretic Transform (NTT) and inverse NTT (INTT) and their modular Montgomery
+#   reduction building blocks for the ML-DSA signature scheme.
+#
+#   The routines are vectorized using AVX2 instructions, performing
+#   modular arithmetic and butterfly operations on multiple
+#   coefficients in parallel. The mathematical structure and transformations strictly
+#   follow those implemented in the corresponding C code (ml_dsa_ntt.c),
+#   ensuring that this file provides a drop-in, performant backend using the
+#   same algorithms and data layout.
+#   A dedicated zeta table is used for the INTT only; it is essentially the
+#   original table from ml_dsa_ntt.c, reformatted to reduce INTT compute cycles.
+#
+#   This module supports both the forward (NTT) and inverse (INTT)
+#   polynomial transforms, as well as element-wise NTT-domain polynomial
+#   multiplication, compatible with the ML-DSA cryptographic protocol.
+#
+#   Step, offset and zeta index details provided for NTT and INTT level
+#   operations correspond directly to the original C implementation in
+#   ml_dsa_ntt.c.
+#
+# Notes:
+#   - Uses AVX2 instructions and YMM registers, each holding 8 32-bit coefficients
+#   - Must be kept functionally synchronized with the math and
+#     interface of ml_dsa_ntt.c
+#   - Data structures, twiddle factors ("zetas"), and constants must match
+#     those in the C implementation
+###############################################################################
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$win64 = 0;
+$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$avx2 = 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/;
+$dir = $1;
+($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
+  or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
+  or die "can't locate x86_64-xlate.pl";
+
+# Check for AVX2 support in assembler
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+  $avx2 = ($1 >= 2.22);
+}
+
+if (!$avx2
+  && $win64
+  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
+  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
+{
+  $avx2 = ($1 >= 2.10);
+}
+
+if (!$avx2 && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
+    $avx2 = ($2>=3.3); # minimal tested version for AVX2
+}
+
+open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
+  or die "can't call $xlate: $!";
+*STDOUT = *OUT;
+
+# ML-DSA constants
+my $ML_DSA_Q = 8380417;             # Q: 2^23 - 2^13 + 1
+my $ML_DSA_Q_NEG_INV = 4236238847;  # -Q^{-1} mod 2^32
+
+#  The multiplicative inverse of 256 mod Q, in Montgomery form, is
+#  ((256^{-1} mod Q) * ((2^32 * 2^32) mod Q)) mod Q = (8347681 * 2365951) mod 8380417
+my $inverse_degree_montgomery = 41978;
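+
+# For reference, a scalar sanity check of these constants (an illustrative C
+# sketch, not part of the generated assembly; it only restates the values
+# defined above):
+#
+#   #include <assert.h>
+#   #include <stdint.h>
+#
+#   int main(void) {
+#       const uint32_t Q = 8380417, Q_NEG_INV = 4236238847u;
+#       assert((uint32_t)(Q * Q_NEG_INV) == UINT32_MAX); /* Q * -Q^{-1} == -1 mod 2^32 */
+#       assert((256u * 8347681u) % Q == 1);              /* 8347681 == 256^{-1} mod Q  */
+#       assert((8347681ull * 2365951ull) % Q == 41978);  /* 2365951 == 2^64 mod Q      */
+#       return 0;
+#   }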
+
+if ($avx2>0) {{{
+
+# AVX2 feature bit: bit 5 of the CPUID(7).EBX word stored at OPENSSL_ia32cap_P+8
+my $avx2_mask = (1<<5);
+
+$code .= <<___;
+.text
+
+.extern OPENSSL_ia32cap_P
+
+.globl  ml_dsa_ntt_avx2_capable
+.type   ml_dsa_ntt_avx2_capable,\@abi-omnipotent
+.align 32
+ml_dsa_ntt_avx2_capable:
+    mov     OPENSSL_ia32cap_P+8(%rip), %rcx
+    xor     %eax, %eax
+    and     \$$avx2_mask, %ecx
+    cmovnz  %ecx, %eax
+    ret
+.size   ml_dsa_ntt_avx2_capable, .-ml_dsa_ntt_avx2_capable
+___
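+
+# A plausible C-side dispatch pattern for the capability check above
+# (illustrative only; the real hook-up lives in ml_dsa_ntt.c, and the
+# fallback name ml_dsa_poly_ntt_c plus the signature are hypothetical):
+#
+#   void ml_dsa_poly_ntt(uint32_t w[256]) {
+#       if (ml_dsa_ntt_avx2_capable())
+#           ml_dsa_poly_ntt_avx2(w);    /* this file */
+#       else
+#           ml_dsa_poly_ntt_c(w);       /* portable C implementation */
+#   }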
+
+###############################################################################
+# multiply_8x8_mod_Q
+#
+# Description:
+#   The inputs (A and B) are in YMM registers, where each packs 8 32-bit integers.
+#   The result, (A * B mod Q), is also in a YMM register.
+#   This routine is used as the core modular multiplication step in
+#   NTT butterfly operations.
+#
+# Parameters:
+#   inA      - Input YMM containing 8 32-bit unsigned values (A)
+#   inB      - Input YMM containing 8 32-bit unsigned values (B)
+#   out      - Output YMM for (A * B mod Q)
+#   tmp0-tmp2 - Temporary registers for intermediate values
+#   q_neg_inv - YMM containing -Q^{-1} mod 2^32 (for Montgomery reduction)
+#   q        - YMM containing modulus Q
+#
+# Output:
+#   out      - Resulting 8 packed 32-bit integers, each (A * B) mod Q
+#
+# Side effects:
+#   Clobbers tmp0, tmp1, tmp2
+#
+# Notes:
+#   inA or inB can also be used as out
+###############################################################################
+sub multiply_8x8_mod_Q {
+    my ($inA, $inB, $out,
+        $tmp0, $tmp1, $tmp2,
+        $q_neg_inv, $q) = @_;
+
+    $code .= <<___;
+    # Multiply A x B
+    vpmuludq $inA, $inB, $tmp0  # multiply even indexes
+    vmovshdup $inA, $tmp1
+    vmovshdup $inB, $tmp2
+    vpmuludq $tmp1, $tmp2, $tmp1  # multiply odd indexes
+
+    # Montgomery reduction: t1 = (A x B)[31..0] x Qinv
+    vpmuludq $q_neg_inv, $tmp0, $out
+    vpmuludq $q_neg_inv, $tmp1, $tmp2
+
+    # t2 = t1[31..0] x Q
+    vpmuludq $out, $q, $out
+    vpmuludq $tmp2, $q, $tmp2
+
+    # t3 = (A x B) + t2
+    vpaddq  $out, $tmp0, $out
+    vpaddq  $tmp2, $tmp1, $tmp1
+
+    # out = t3 >> 32
+    vmovshdup $out, $out
+    vpblendd \$0xAA, $tmp1, $out, $out
+
+    # out can be between Q and 2Q, do final reduction
+    vpcmpgtd $out, $q, $tmp0    # $tmp0 = $q > $out ? 0xffffffff : 0
+    vpandn $q, $tmp0, $tmp0     # $tmp0 = ~$tmp0 & $q
+    vpsubd $tmp0, $out, $out    # $out -= $tmp0
+___
+}
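+
+# Scalar model of the vector sequence above (an illustrative C sketch; every
+# 32-bit YMM lane follows this exact dataflow):
+#
+#   uint32_t mont_mul_mod_q(uint32_t a, uint32_t b) {
+#       uint64_t p = (uint64_t)a * b;             /* full 64-bit product    */
+#       uint32_t m = (uint32_t)p * 4236238847u;   /* p * -Q^{-1} mod 2^32   */
+#       uint32_t r = (uint32_t)((p + (uint64_t)m * 8380417u) >> 32);
+#       return r >= 8380417u ? r - 8380417u : r;  /* final reduction: r < 2Q */
+#   }
+#
+# The result carries the usual Montgomery factor: r == a * b * 2^{-32} mod Q.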
+
+###############################################################################
+###############################################################################
+###
+### NTT (Number Theoretic Transform)
+###
+###############################################################################
+###############################################################################
+
+###############################################################################
+# ntt_butterfly
+#
+# Description:
+#   Performs the butterfly operation for a single NTT stage on two input YMM registers,
+#   applying a twiddle factor ("zetas"). This is the core step in NTT layers:
+#     - Computes t_odd = w_odd * zetas mod Q (Montgomery form)
+#     - n_even = w_even + t_odd mod Q
+#     - n_odd  = (w_even + Q) - t_odd mod Q
+#   Uses AVX2 instructions to process 8 coefficients at a time.
+#
+# Parameters:
+#   w_even  - YMM containing the "even" values
+#   w_odd   - YMM containing the "odd" values
+#   zetas   - YMM containing the twiddle factor(s)
+#   n_even  - Output YMM for the updated even coefficients
+#   n_odd   - Output YMM for the updated odd coefficients
+#   tmp0, tmp1, tmp2, tmp3 - Temporary registers for intermediate values
+#   q_neg_inv - YMM with -Q^{-1} mod 2^32
+#   q       - YMM with modulus Q
+#
+# Output:
+#   n_even  - Updated even coefficients after butterfly and reduction
+#   n_odd   - Updated odd coefficients after butterfly and reduction
+#
+# Side effects:
+#   Clobbers tmp0, tmp1, tmp2, tmp3
+###############################################################################
+sub ntt_butterfly {
+    my ($w_even, $w_odd,
+        $zetas,
+        $n_even, $n_odd,
+        $tmp0, $tmp1, $tmp2, $tmp3,
+        $q_neg_inv, $q) = @_;
+
+    &multiply_8x8_mod_Q($w_odd, # A
+                        $zetas, # B
+                        $tmp0,  # out (AxB)
+                        $tmp1, $tmp2, $tmp3, # tmp
+                        $q_neg_inv, $q);  # qinv, q
+
+    $code .= <<___;
+
+    # t_odd = $tmp0
+
+    # compute new w_odd (n_odd): (w_even + Q) - t_odd
+    vpaddd  $q, $w_even, $tmp1
+    vpsubd  $tmp0, $tmp1, $n_odd
+
+    # compute new w_even (n_even): w_even + t_odd
+    vpaddd  $w_even, $tmp0, $n_even
+
+    # reduce n_even & n_odd
+    # - results can be between Q and 2Q
+    vpcmpgtd $n_even, $q, $tmp0 # $tmp0 = $q > $n_even ? 0xffffffff : 0
+    vpcmpgtd $n_odd, $q, $tmp1  # $tmp1 = $q > $n_odd ? 0xffffffff : 0
+    vpandn $q, $tmp0, $tmp0     # $tmp0 = ~$tmp0 & $q
+    vpandn $q, $tmp1, $tmp1     # $tmp1 = ~$tmp1 & $q
+    vpsubd $tmp0, $n_even, $n_even  # $n_even -= $tmp0
+    vpsubd $tmp1, $n_odd, $n_odd    # $n_odd -= $tmp1
+___
+}
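+
+# Scalar model of one forward butterfly (illustrative; mont_mul_mod_q is the
+# sketch given after multiply_8x8_mod_Q above):
+#
+#   uint32_t t      = mont_mul_mod_q(w_odd, zeta);  /* t < Q */
+#   uint32_t n_even = w_even + t;                   /* < 2Q */
+#   uint32_t n_odd  = w_even + 8380417u - t;        /* < 2Q, no underflow */
+#   if (n_even >= 8380417u) n_even -= 8380417u;
+#   if (n_odd  >= 8380417u) n_odd  -= 8380417u;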
+
+###############################################################################
+# ntt_levels0to2
+#
+# Description: Performs the first three layers (levels 0, 1, and 2) of the NTT.
+#   It works on 8 YMM registers, each holding 8 32-bit coefficients. Consecutive
+#   YMM loads are separated by 32 coefficients (32 x 4 bytes = 128 bytes). All 8
+#   YMM registers undergo consecutive butterfly operations with the appropriate
+#   "zetas" (twiddle factors) for each level. This function must be called 4 times
+#   with different offsets to process all 256 coefficients.
+#
+# Layer/level details:
+#   - Level 0:  offset = 128, step = 1, uses zeta index 1
+#   - Level 1:  offset =  64, step = 2, uses zeta indexes 2, 3
+#   - Level 2:  offset =  32, step = 4, uses zeta indexes 4, 5, 6, 7
+#
+# Prerequisites:
+#   %rdi    - pointer to the coefficients
+#   %r11    - pointer to the zetas (twiddle factors) table
+#   %ymm14  - register with q_neg_inv (for Montgomery reduction)
+#   %ymm15  - register with modulus Q
+#
+# Arguments:
+#   $off    - offset to the start of a group of 8 coefficients (in bytes, relative to %rdi)
+#             valid values: 0*4, 8*4, 16*4 or 24*4
+#
+# Output:
+#   In-place NTT updated coefficients in memory.
+#
+# Notes:
+#   - Must be invoked 4 times for complete polynomial: offsets 0, 8*4, 16*4, 24*4
+###############################################################################
+
+sub ntt_levels0to2 {
+    my ($off) = @_;
+    $code .= <<___;
+    vmovdqu $off+0*4(%rdi), %ymm0
+    vmovdqu $off+32*4(%rdi), %ymm1
+    vmovdqu $off+64*4(%rdi), %ymm2
+    vmovdqu $off+96*4(%rdi), %ymm3
+    vmovdqu $off+128*4(%rdi), %ymm4
+    vmovdqu $off+160*4(%rdi), %ymm5
+    vmovdqu $off+192*4(%rdi), %ymm6
+    vmovdqu $off+224*4(%rdi), %ymm7
+
+    # ==============================================================
+    # level 0: offset = 128, step = 1
+    # zeta indexes = 1
+
+    vpbroadcastd 1*4(%r11), %ymm13
+___
+    &ntt_butterfly("%ymm0", "%ymm4", "%ymm13", "%ymm0", "%ymm4",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    &ntt_butterfly("%ymm1", "%ymm5", "%ymm13", "%ymm1", "%ymm5",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    &ntt_butterfly("%ymm2", "%ymm6", "%ymm13", "%ymm2", "%ymm6",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    &ntt_butterfly("%ymm3", "%ymm7", "%ymm13", "%ymm3", "%ymm7",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+
+    # ==============================================================
+    # level 1: offset = 64, step = 2
+    # zeta indexes = 2, 3
+
+    vpbroadcastd 2*4(%r11), %ymm13
+___
+    &ntt_butterfly("%ymm0", "%ymm2", "%ymm13", "%ymm0", "%ymm2",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    &ntt_butterfly("%ymm1", "%ymm3", "%ymm13", "%ymm1", "%ymm3",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+    vpbroadcastd 3*4(%r11), %ymm13
+___
+    &ntt_butterfly("%ymm4", "%ymm6", "%ymm13", "%ymm4", "%ymm6",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    &ntt_butterfly("%ymm5", "%ymm7", "%ymm13", "%ymm5", "%ymm7",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+
+    # ==============================================================
+    # level 2: offset = 32, step = 4
+    # zeta indexes = 4, 5, 6, 7
+
+    vpbroadcastd 4*4(%r11), %ymm13
+___
+    &ntt_butterfly("%ymm0", "%ymm1", "%ymm13", "%ymm0", "%ymm1",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+    vpbroadcastd 5*4(%r11), %ymm13
+___
+    &ntt_butterfly("%ymm2", "%ymm3", "%ymm13", "%ymm2", "%ymm3",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+    vpbroadcastd 6*4(%r11), %ymm13
+___
+    &ntt_butterfly("%ymm4", "%ymm5", "%ymm13", "%ymm4", "%ymm5",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+    vpbroadcastd 7*4(%r11), %ymm13
+___
+    &ntt_butterfly("%ymm6", "%ymm7", "%ymm13", "%ymm6", "%ymm7",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm8", "%ymm9", "%ymm10", "%ymm11",            # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+
+    vmovdqu %ymm0, $off+0*4(%rdi)
+    vmovdqu %ymm1, $off+32*4(%rdi)
+    vmovdqu %ymm2, $off+64*4(%rdi)
+    vmovdqu %ymm3, $off+96*4(%rdi)
+    vmovdqu %ymm4, $off+128*4(%rdi)
+    vmovdqu %ymm5, $off+160*4(%rdi)
+    vmovdqu %ymm6, $off+192*4(%rdi)
+    vmovdqu %ymm7, $off+224*4(%rdi)
+___
+}
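+
+# The level/offset/zeta-index bookkeeping above (and in ntt_levels3to7 below)
+# corresponds to a classic iterative NTT of the following shape (a sketch of
+# the C structure this file mirrors; names are illustrative, and
+# reduce_once(x) stands for x >= Q ? x - Q : x):
+#
+#   unsigned k = 0;
+#   for (unsigned off = 128; off >= 1; off >>= 1)         /* levels 0..7    */
+#       for (unsigned s = 0; s < 256; s += 2 * off) {
+#           uint32_t zeta = zetas[++k];                   /* indexes 1..255 */
+#           for (unsigned j = s; j < s + off; j++) {
+#               uint32_t t = mont_mul_mod_q(w[j + off], zeta);
+#               w[j + off] = reduce_once(w[j] + 8380417u - t);
+#               w[j]       = reduce_once(w[j] + t);
+#           }
+#       }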
+
+###############################################################################
+# ntt_levels3to7
+#
+# Description:
+#   Performs layers 3 through 7 of the NTT on 64 coefficients.
+#
+#   It operates on 8 YMM registers, each packing 8 32-bit coefficients
+#   (64 in total). Contiguous coefficients are loaded into the YMM registers
+#   (no gaps between them).
+#
+#   The function must be called 4 times to process 256 coefficients.  At each level, the
+#   function executes butterfly operations in the correct coefficient pattern and applies the
+#   corresponding twiddle factors ("zetas").
+#
+# Layer/level details:
+#   - Level 3: offset = 16, step = 8;   zeta indexes 8...15
+#   - Level 4: offset =  8, step = 16;  zeta indexes 16...31
+#   - Level 5: offset =  4, step = 32;  zeta indexes 32...63
+#   - Level 6: offset =  2, step = 64;  zeta indexes 64...127
+#   - Level 7: offset =  1, step = 128; zeta indexes 128...255
+#
+# Prerequisites:
+#   %rdi    - pointer to the coefficients
+#   %r11    - pointer to the zetas (twiddle factors) table
+#   %ymm14  - q_neg_inv (Montgomery reduction)
+#   %ymm15  - Q (modulus)
+#
+# Arguments:
+#   $off, $l3, $l4, $l5, $l6, $l7
+#     $off - offset to the start of a set of 8 coefficients (in bytes, relative to %rdi)
+#     $l3, $l4, $l5, $l6, $l7 - offsets to required zetas in the table, for levels 3 to 7
+#
+# Output:
+#   In-place NTT-transformed coefficients for the selected group.
+#
+# Notes:
+#   - Should be called 4 times per complete 256-coefficient transform (offsets 0, 64*4, 128*4, 192*4).
+#   - All butterfly operations and twiddle applications handled by subroutine ntt_butterfly.
+###############################################################################
+sub ntt_levels3to7 {
+    my ($off,$l3,$l4,$l5,$l6,$l7) = @_;
+
+    $code .= <<___;
+    # ==============================================================
+    # level 3: offset = 16, step = 8
+    # zeta indexes = 8, 9, 10, 11, 12, 13, 14, 15
+
+    # broadcast zetas
+    vpbroadcastd $l3(%r11), %ymm13  # zeta for coefficients 0-15
+
+    # load w_even and w_odd
+    vmovdqu $off(%rdi), %ymm0       # w_even[0:7]
+    vmovdqu $off+32(%rdi), %ymm1    # w_even[8:15]
+    vmovdqu $off+64(%rdi), %ymm2    # w_odd[0:7]
+    vmovdqu $off+96(%rdi), %ymm3    # w_odd[8:15]
+___
+    # Process first 16 coefficients with zeta in ymm13
+    &ntt_butterfly("%ymm0", "%ymm2", "%ymm13", "%ymm0", "%ymm2",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    &ntt_butterfly("%ymm1", "%ymm3", "%ymm13", "%ymm1", "%ymm3",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+    # broadcast zetas
+    vpbroadcastd $l3+4(%r11), %ymm13  # zeta for coefficients 16-31
+
+    # load w_even and w_odd
+    vmovdqu $off+128(%rdi), %ymm4   # w_even[16:23]
+    vmovdqu $off+160(%rdi), %ymm5   # w_even[24:31]
+    vmovdqu $off+192(%rdi), %ymm6   # w_odd[16:23]
+    vmovdqu $off+224(%rdi), %ymm7   # w_odd[24:31]
+___
+    # Process next 16 coefficients with zeta in ymm13
+    &ntt_butterfly("%ymm4", "%ymm6", "%ymm13", "%ymm4", "%ymm6",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    &ntt_butterfly("%ymm5", "%ymm7", "%ymm13", "%ymm5", "%ymm7",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+
+    # ==============================================================
+    # level 4: offset = 8, step = 16
+    # zeta indexes = 16, 17, 18, ..., 30, 31
+
+    # broadcast zetas for first 8 coefficients
+    vpbroadcastd $l4(%r11), %ymm13      # zeta for coefficients 0-7
+
+    # Prepare w_even and w_odd
+    # Input dword layout:
+    #   ymm0 = [ 0  1  2  3 |  4  5  6  7] (even)
+    #   ymm1 = [ 8  9 10 11 | 12 13 14 15] (even)
+    #   ymm2 = [16 17 18 19 | 20 21 22 23] (odd)
+    #   ymm3 = [24 25 26 27 | 28 29 30 31] (odd)
+    # Required dword layout is the same:
+    #   ymm0 = [ 0  1  2  3 |  4  5  6  7] (even)
+    #   ymm1 = [ 8  9 10 11 | 12 13 14 15] (odd)
+    #   ymm2 = [16 17 18 19 | 20 21 22 23] (even)
+    #   ymm3 = [24 25 26 27 | 28 29 30 31] (odd)
+
+___
+    &ntt_butterfly("%ymm0", "%ymm1", "%ymm13", "%ymm0", "%ymm1",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+    # broadcast zetas for next 8 coefficients
+    vpbroadcastd $l4+4(%r11), %ymm13    # zeta for coefficients 8-15
+___
+    &ntt_butterfly("%ymm2", "%ymm3", "%ymm13", "%ymm2", "%ymm3",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+    # broadcast zetas for next 8 coefficients
+    vpbroadcastd $l4+8(%r11), %ymm13    # zeta for coefficients 16-23
+___
+    &ntt_butterfly("%ymm4", "%ymm5", "%ymm13", "%ymm4", "%ymm5",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+    # broadcast zetas for next 8 coefficients
+    vpbroadcastd $l4+12(%r11), %ymm13   # zeta for coefficients 24-31
+___
+    &ntt_butterfly("%ymm6", "%ymm7", "%ymm13", "%ymm6", "%ymm7",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+
+    # ==============================================================
+    # level 5: offset = 4, step = 32
+    # zeta indexes = 32, 33, 34, ..., 62, 63
+
+    # load zetas for first 8 coefficients
+    vpbroadcastd $l5(%r11), %ymm13         # zetas for coefficients 0-3
+    vpbroadcastd $l5+4(%r11), %ymm12       # zetas for coefficients 4-7
+    vpblendd \$0xf0, %ymm12, %ymm13, %ymm13 # blend into ymm13
+
+    # prepare w_even and w_odd
+    # Input dword layout:
+    #   ymm0 = [ 0  1  2  3 |  4  5  6  7] (even)
+    #   ymm1 = [ 8  9 10 11 | 12 13 14 15] (odd)
+    # Required dword layout:
+    #   ymm8 = [ 0  1  2  3 |  8  9 10 11] (even)
+    #   ymm1 = [ 4  5  6  7 | 12 13 14 15] (odd)
+    vperm2i128 \$0x20, %ymm1, %ymm0, %ymm8
+    vperm2i128 \$0x31, %ymm1, %ymm0, %ymm1
+
+___
+    &ntt_butterfly("%ymm8", "%ymm1", "%ymm13", "%ymm0", "%ymm1",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+    # load zetas for next 8 coefficients
+    vpbroadcastd $l5+8(%r11), %ymm13       # zetas for coefficients 8-11
+    vpbroadcastd $l5+12(%r11), %ymm12       # zetas for coefficients 12-15
+    vpblendd \$0xf0, %ymm12, %ymm13, %ymm13 # blend into ymm13
+
+    # prepare w_even and w_odd
+    vperm2i128 \$0x20, %ymm3, %ymm2, %ymm8
+    vperm2i128 \$0x31, %ymm3, %ymm2, %ymm3
+___
+    &ntt_butterfly("%ymm8", "%ymm3", "%ymm13", "%ymm2", "%ymm3",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+    # load zetas for next 8 coefficients
+    vpbroadcastd $l5+16(%r11), %ymm13       # zetas for coefficients 16-19
+    vpbroadcastd $l5+20(%r11), %ymm12       # zetas for coefficients 20-23
+    vpblendd \$0xf0, %ymm12, %ymm13, %ymm13 # blend into ymm13
+
+    # prepare w_even and w_odd
+    vperm2i128 \$0x20, %ymm5, %ymm4, %ymm8
+    vperm2i128 \$0x31, %ymm5, %ymm4, %ymm5
+___
+    &ntt_butterfly("%ymm8", "%ymm5", "%ymm13", "%ymm4", "%ymm5",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+    # load zetas for next 8 coefficients
+    vpbroadcastd $l5+24(%r11), %ymm13       # zetas for coefficients 24-27
+    vpbroadcastd $l5+28(%r11), %ymm12       # zetas for coefficients 28-31
+    vpblendd \$0xf0, %ymm12, %ymm13, %ymm13 # blend into ymm13
+
+    # prepare w_even and w_odd
+    vperm2i128 \$0x20, %ymm7, %ymm6, %ymm8
+    vperm2i128 \$0x31, %ymm7, %ymm6, %ymm7
+___
+    &ntt_butterfly("%ymm8", "%ymm7", "%ymm13", "%ymm6", "%ymm7",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+
+    # ==============================================================
+    # level 6: offset = 2, step = 64
+    # zeta indexes = 64, 65, 66, ..., 126, 127
+
+    # Input DWORD layout in memory:
+    #   ymm0 = [ 0  1  2  3] [ 8  9 10 11] (even)
+    #   ymm1 = [ 4  5  6  7] [12 13 14 15] (odd)
+    #   ymm2 = [16 17 18 19] [24 25 26 27] (even)
+    #   ymm3 = [20 21 22 23] [28 29 30 31] (odd)
+    # Desired DWORD layout:
+    #   ymm8 = [ 0  1  4  5] [ 8  9 12 13] (even)
+    #   ymm1 = [ 2  3  6  7] [10 11 14 15] (odd)
+
+    # load & prepare zetas for first 16 coefficients
+    vmovdqu $l6(%r11), %ymm10
+    vpshufd \$0xfa, %ymm10, %ymm11
+    vpshufd \$0x50, %ymm10, %ymm10
+    vperm2i128 \$0x20, %ymm11, %ymm10, %ymm13
+    vperm2i128 \$0x31, %ymm11, %ymm10, %ymm12
+    vmovdqu %ymm12, (%rsp)
+
+    # prepare w_even and w_odd
+    vpunpcklqdq %ymm1, %ymm0, %ymm8
+    vpunpckhqdq %ymm1, %ymm0, %ymm1
+___
+    &ntt_butterfly("%ymm8", "%ymm1", "%ymm13", "%ymm0", "%ymm1",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+    # load formatted zetas from the stack
+    vmovdqu (%rsp), %ymm13
+
+    # prepare w_even and w_odd
+    vpunpcklqdq %ymm3, %ymm2, %ymm8
+    vpunpckhqdq %ymm3, %ymm2, %ymm3
+___
+    &ntt_butterfly("%ymm8", "%ymm3", "%ymm13", "%ymm2", "%ymm3",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+
+    # load & prepare zetas for next 16 coefficients
+    vmovdqu $l6+32(%r11), %ymm10
+    vpshufd \$0xfa, %ymm10, %ymm11
+    vpshufd \$0x50, %ymm10, %ymm10
+    vperm2i128 \$0x20, %ymm11, %ymm10, %ymm13
+    vperm2i128 \$0x31, %ymm11, %ymm10, %ymm12
+    vmovdqu %ymm12, (%rsp)
+
+    # prepare w_even and w_odd
+    vpunpcklqdq %ymm5, %ymm4, %ymm8
+    vpunpckhqdq %ymm5, %ymm4, %ymm5
+___
+    &ntt_butterfly("%ymm8", "%ymm5", "%ymm13", "%ymm4", "%ymm5",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+    # load formatted zetas from the stack
+    vmovdqu (%rsp), %ymm13
+
+    # prepare w_even and w_odd
+    vpunpcklqdq %ymm7, %ymm6, %ymm8
+    vpunpckhqdq %ymm7, %ymm6, %ymm7
+___
+    &ntt_butterfly("%ymm8", "%ymm7", "%ymm13", "%ymm6", "%ymm7",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+
+    # ==============================================================
+    # level 7: offset = 1, step = 128
+    # zeta indexes = 128, 129, 130, ..., 254, 255
+
+    # Input DWORD layout:
+    #   ymm0 = [ 0  1  4  5] [ 8  9 12 13]
+    #   ymm1 = [ 2  3  6  7] [10 11 14 15]
+    # Required DWORD layout:
+    #   ymm8 = [ 0  2  4  6] [ 8 10 12 14] (even)
+    #   ymm1 = [ 1  3  5  7] [ 9 11 13 15] (odd)
+
+    vpshufd \$0x88, %ymm0, %ymm8
+    vpshufd \$0x88, %ymm1, %ymm9
+    vpshufd \$0xDD, %ymm0, %ymm10
+    vpshufd \$0xDD, %ymm1, %ymm11
+    vpunpckldq %ymm9, %ymm8, %ymm0
+    vpunpckldq %ymm11, %ymm10, %ymm1
+
+    # load zetas
+    vmovdqu $l7(%r11), %ymm13          # 8 zetas for coefficients 0-7
+___
+    &ntt_butterfly("%ymm0", "%ymm1", "%ymm13", "%ymm0", "%ymm1",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+    # load zetas
+    vmovdqu $l7+32(%r11), %ymm13       # 8 zetas for coefficients 8-15
+
+    # format w_even and w_odd
+    vpshufd \$0x88, %ymm2, %ymm8
+    vpshufd \$0x88, %ymm3, %ymm9
+    vpshufd \$0xDD, %ymm2, %ymm10
+    vpshufd \$0xDD, %ymm3, %ymm11
+    vpunpckldq %ymm9, %ymm8, %ymm2
+    vpunpckldq %ymm11, %ymm10, %ymm3
+___
+    &ntt_butterfly("%ymm2", "%ymm3", "%ymm13", "%ymm2", "%ymm3",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+
+    # load zetas
+    vmovdqu $l7+64(%r11), %ymm13       # 8 zetas for coefficients 16-23
+
+    # format w_even and w_odd
+    vpshufd \$0x88, %ymm4, %ymm8
+    vpshufd \$0x88, %ymm5, %ymm9
+    vpshufd \$0xDD, %ymm4, %ymm10
+    vpshufd \$0xDD, %ymm5, %ymm11
+    vpunpckldq %ymm9, %ymm8, %ymm4
+    vpunpckldq %ymm11, %ymm10, %ymm5
+___
+    &ntt_butterfly("%ymm4", "%ymm5", "%ymm13", "%ymm4", "%ymm5",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+$code .= <<___;
+    # load zetas
+    vmovdqu $l7+96(%r11), %ymm13       # 8 zetas for coefficients 24-31
+
+    # format w_even and w_odd
+    vpshufd \$0x88, %ymm6, %ymm8
+    vpshufd \$0x88, %ymm7, %ymm9
+    vpshufd \$0xDD, %ymm6, %ymm10
+    vpshufd \$0xDD, %ymm7, %ymm11
+    vpunpckldq %ymm9, %ymm8, %ymm6
+    vpunpckldq %ymm11, %ymm10, %ymm7
+___
+    &ntt_butterfly("%ymm6", "%ymm7", "%ymm13", "%ymm6", "%ymm7",    # w_even, w_odd, zetas, n_even, n_odd
+                   "%ymm9", "%ymm10", "%ymm11", "%ymm12",           # tmp
+                   "%ymm14", "%ymm15");                             # qinv, q
+    $code .= <<___;
+
+    # Interleave and store first 16
+    vpunpckldq %ymm1, %ymm0, %ymm8
+    vpunpckhdq %ymm1, %ymm0, %ymm9
+
+    vperm2i128 \$0x20, %ymm9, %ymm8, %ymm10
+    vperm2i128 \$0x31, %ymm9, %ymm8, %ymm11
+    vmovdqu %ymm10, $off(%rdi)
+    vmovdqu %ymm11, $off+32(%rdi)
+
+    vpunpckldq  %ymm3, %ymm2, %ymm8
+    vpunpckhdq  %ymm3, %ymm2, %ymm9
+
+    vperm2i128 \$0x20, %ymm9, %ymm8, %ymm10
+    vperm2i128 \$0x31, %ymm9, %ymm8, %ymm11
+    vmovdqu %ymm10, $off+64(%rdi)
+    vmovdqu %ymm11, $off+96(%rdi)
+
+    # Interleave and store second 16
+    vpunpckldq %ymm5, %ymm4, %ymm8
+    vpunpckhdq %ymm5, %ymm4, %ymm9
+
+    vperm2i128 \$0x20, %ymm9, %ymm8, %ymm10
+    vperm2i128 \$0x31, %ymm9, %ymm8, %ymm11
+    vmovdqu %ymm10, $off+128(%rdi)
+    vmovdqu %ymm11, $off+160(%rdi)
+
+    vpunpckldq  %ymm7, %ymm6, %ymm8
+    vpunpckhdq  %ymm7, %ymm6, %ymm9
+
+    vperm2i128 \$0x20, %ymm9, %ymm8, %ymm10
+    vperm2i128 \$0x31, %ymm9, %ymm8, %ymm11
+    vmovdqu %ymm10, $off+192(%rdi)
+    vmovdqu %ymm11, $off+224(%rdi)
+___
+}
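+
+# Why in-register shuffles appear from level 5 onwards (illustrative): a
+# butterfly pairs w[j] with w[j + off], and once off drops below 8 (the number
+# of dwords per YMM) both operands land in the same register, so the code
+# re-packs lanes with vperm2i128/vpunpck*/vpshufd instead of loading separate
+# even and odd vectors. Scalar view of the pairing at these levels:
+#
+#   for (unsigned j = s; j < s + off; j++)    /* off = 16, 8, 4, 2, 1 */
+#       butterfly(&w[j], &w[j + off], zeta);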
+
+###############################################################################
+###############################################################################
+###
+### INTT (Inverse Number Theoretic Transform)
+###
+###############################################################################
+###############################################################################
+
+###############################################################################
+# intt_butterfly
+#
+# Description:
+#   Performs the butterfly operation for a single stage of the INTT
+#   on two 8-element vectors of 32-bit integers (YMM).
+#
+#   Implements the core data-mixing and modular multiplication steps with a
+#   zeta (twiddle factor), including the modular reductions. The updated even and
+#   odd results are put into the designated registers.
+#
+#   This butterfly operation computes:
+#     n_even = (w_even + w_odd) mod Q
+#     n_odd  = ((w_even + Q) - w_odd) * zetas mod Q
+#   where Q is the NTT modulus, and zetas is the power of a primitive root used
+#   for this stage.
+#
+# Parameters:
+#   w_even    - YMM with "even" input coefficients
+#   w_odd     - YMM with "odd" input coefficients
+#   zetas     - YMM with modular twiddle factors for this butterfly
+#   tmp0-tmp2 - Scratch YMM registers for temporary/intermediate values
+#   n_even    - Output YMM for new even coefficients
+#   n_odd     - Output YMM for new odd coefficients
+#   q_neg_inv - YMM with -Q^{-1} mod 2^32 for Montgomery reduction
+#   q         - YMM with modulus Q
+#
+# Output:
+#   n_even, n_odd
+#
+# Side effects:
+#   Clobbers tmp0, tmp1, tmp2
+###############################################################################
+
+sub intt_butterfly {
+    my ($w_even, $w_odd, $zetas,
+        $tmp0, $tmp1, $tmp2,
+        $n_even, $n_odd,
+        $q_neg_inv, $q) = @_;
+
+$code .= <<___;
+    # n_even = reduce_once(w_even + w_odd)
+    # n_odd = (w_even + Q) - w_odd
+    vpaddd $q, $w_even, $tmp1           # n_odd: tmp1 = w_even + Q
+    vpaddd $w_even, $w_odd, $n_even     # n_even: n_even = w_even + w_odd
+    vpsubd $w_odd, $tmp1, $n_odd        # n_odd: n_odd = tmp1 - w_odd
+
+    vpcmpgtd $n_even, $q, $tmp0         # n_even: tmp0 = $q > $n_even ? 0xffffffff : 0
+    vpandn $q, $tmp0, $tmp0             # n_even: tmp0 = ~$tmp0 & $q
+    vpsubd $tmp0, $n_even, $n_even      # n_even: n_even -= $tmp0
+
+    # Multiply n_odd by zetas (step root)
+___
+    &multiply_8x8_mod_Q($n_odd,     # A
+                        $zetas,     # B
+                        $n_odd,     # out (AxB)
+                        $tmp0, $tmp1, $tmp2,    # tmp
+                        $q_neg_inv, $q);        # qinv, q
+}
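+
+# Scalar model of one inverse butterfly (illustrative; mont_mul_mod_q as in
+# the sketch after multiply_8x8_mod_Q):
+#
+#   uint32_t n_even = w_even + w_odd;                   /* < 2Q */
+#   if (n_even >= 8380417u) n_even -= 8380417u;
+#   /* operand below may reach 2Q; the multiply still reduces below Q */
+#   uint32_t n_odd = mont_mul_mod_q(w_even + 8380417u - w_odd, zeta);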
+
+###############################################################################
+# intt_levels0to4
+#
+# Description:
+#   Executes the first five stages (levels 0-4) of the INTT
+#   on blocks of 64 coefficients (8 YMM registers).
+#
+#   This function hierarchically mixes and transforms groups of coefficients using
+#   butterfly operations and level specific zeta (twiddle) factors, performing all required
+#   re-packing and permutations for each layer.
+#
+#   Each call operates on a block of 64 coefficients, and must be repeated 4 times (with
+#   offsets 0, 64*4, 128*4, 192*4) to process all 256 coefficients.
+#
+# Layer/Level details:
+#   - Level 0: offset = 1,   step = 128;  zeta indexes (new) = 0..127
+#   - Level 1: offset = 2,   step = 64;   zeta indexes (new) = 128..191
+#   - Level 2: offset = 4,   step = 32;   zeta indexes (new) = 192..223
+#   - Level 3: offset = 8,   step = 16;   zeta indexes (new) = 224..239
+#   - Level 4: offset = 16,  step = 8;    zeta indexes (new) = 240..247
+#
+# Prerequisites:
+#   %rdi    - pointer to the coefficients array
+#   %r11    - pointer to the zetas (twiddle factors) table
+#   %ymm14  - q_neg_inv (for Montgomery reduction)
+#   %ymm15  - Q (modulus)
+#
+# Arguments:
+#   $off    - offset (in bytes) to the start of the 64-coefficient block
+#   $l0-$l4 - offsets (in bytes) into the zeta table for each INTT level 0..4
+#
+# Output:
+#   Updated coefficients are written in-place in memory.
+#
+# Notes:
+#   - Function must be called 4 times for a full 256-coefficient INTT layer sweep.
+###############################################################################
+
+sub intt_levels0to4 {
+    my ($off,$l0,$l1,$l2,$l3,$l4) = @_;
+    $code .= <<___;
+    # ==============================================================
+    # level 0: offset = 1, step = 128
+    # zeta indexes (original table) = 255, 254, 253, ... 129, 128
+    # zeta indexes (new table) = 0, 1, 2, .. 127
+
+    # dword layout in memory:
+    #   ymm0 = [0,1,2,3 | 4,5,6,7]
+    #   ymm1 = [8,9,10,11 | 12,13,14,15]
+    # required dword layout in registers:
+    #   ymm0 = [0,2,4,6 | 8,10,12,14]
+    #   ymm1 = [1,3,5,7 | 9,11,13,15]
+
+    # load w_even and w_odd
+    vmovdqu $off(%rdi), %ymm8
+    vmovdqu $off+32(%rdi), %ymm9
+
+    # compact even words into ymm0 (w_even[0:7])
+    vmovdqa idx_even(%rip), %ymm13
+    vpermd %ymm8, %ymm13, %ymm10            # [ 0, 2, 4, 6 | 0, 2, 4, 6]
+    vpermd %ymm9, %ymm13, %ymm11            # [ 8,10,12,14 | 8,10,12,14]
+    vpblendd \$0xf0, %ymm11, %ymm10, %ymm0  # [ 0, 2, 4, 6 | 8,10,12,14]
+
+    # compact odd words into ymm1 (w_odd[0..7])
+    vmovdqa idx_odd(%rip), %ymm13
+    vpermd %ymm8, %ymm13, %ymm10            # [ 1, 3, 5, 7 | 1, 3, 5, 7]
+    vpermd %ymm9, %ymm13, %ymm11            # [ 9,11,13,15 | 9,11,13,15]
+    vpblendd \$0xf0, %ymm11, %ymm10, %ymm1  # [ 1, 3, 5, 7 | 9,11,13,15]
+
+    # load 16 zetas
+    vmovdqu $l0(%r11), %ymm13
+
+___
+
+    &intt_butterfly("%ymm0", "%ymm1", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm0", "%ymm1",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    # load w_even and w_odd
+    vmovdqu $off+64(%rdi), %ymm8
+    vmovdqu $off+96(%rdi), %ymm9
+
+    # compact even words into ymm2 (w_even[8..15])
+    vmovdqa idx_even(%rip), %ymm13
+    vpermd %ymm8, %ymm13, %ymm10
+    vpermd %ymm9, %ymm13, %ymm11
+    vpblendd \$0xf0, %ymm11, %ymm10, %ymm2
+
+    # compact odd words into ymm3 (w_odd[8..15])
+    vmovdqa idx_odd(%rip), %ymm13
+    vpermd %ymm8, %ymm13, %ymm10
+    vpermd %ymm9, %ymm13, %ymm11
+    vpblendd \$0xf0, %ymm11, %ymm10, %ymm3
+
+    # load 16 zetas
+    vmovdqu $l0+32(%r11), %ymm13
+
+___
+
+    &intt_butterfly("%ymm2", "%ymm3", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm2", "%ymm3",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+    # load w_even and w_odd
+    vmovdqu $off+128(%rdi), %ymm8
+    vmovdqu $off+160(%rdi), %ymm9
+
+    # compact even words into ymm4 (w_even[16..23])
+    vmovdqa idx_even(%rip), %ymm13
+    vpermd %ymm8, %ymm13, %ymm10            # [ 0, 2, 4, 6 | 0, 2, 4, 6]
+    vpermd %ymm9, %ymm13, %ymm11            # [ 8,10,12,14 | 8,10,12,14]
+    vpblendd \$0xf0, %ymm11, %ymm10, %ymm4  # [ 0, 2, 4, 6 | 8,10,12,14]
+
+    # compact odd words into ymm5 (w_odd[16..23])
+    vmovdqa idx_odd(%rip), %ymm13
+    vpermd %ymm8, %ymm13, %ymm10            # [ 1, 3, 5, 7 | 1, 3, 5, 7]
+    vpermd %ymm9, %ymm13, %ymm11            # [ 9,11,13,15 | 9,11,13,15]
+    vpblendd \$0xf0, %ymm11, %ymm10, %ymm5  # [ 1, 3, 5, 7 | 9,11,13,15]
+
+    # load 16 zetas
+    vmovdqu $l0+64(%r11), %ymm13
+
+___
+
+    &intt_butterfly("%ymm4", "%ymm5", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm4", "%ymm5",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    # load w_even and w_odd
+    vmovdqu $off+192(%rdi), %ymm8
+    vmovdqu $off+224(%rdi), %ymm9
+
+    # compact even words into ymm6 (w_even[24..31])
+    vmovdqa idx_even(%rip), %ymm13
+    vpermd %ymm8, %ymm13, %ymm10
+    vpermd %ymm9, %ymm13, %ymm11
+    vpblendd \$0xf0, %ymm11, %ymm10, %ymm6
+
+    # compact odd words into ymm7 (w_odd[24..31])
+    vmovdqa idx_odd(%rip), %ymm13
+    vpermd %ymm8, %ymm13, %ymm10
+    vpermd %ymm9, %ymm13, %ymm11
+    vpblendd \$0xf0, %ymm11, %ymm10, %ymm7
+
+    # load 16 zetas
+    vmovdqu $l0+96(%r11), %ymm13
+
+___
+
+    &intt_butterfly("%ymm6", "%ymm7", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm6", "%ymm7",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    # ==============================================================
+    # level 1: offset = 2, step = 64
+    # zeta indexes = 127, 126, 125, ... 65, 64
+    # zeta indexes (new) = 128, 129, .. 191
+
+    # Result DWORD layout:
+    #   ymm0 = [0,  2,  4,  6,  8, 10, 12, 14]
+    #   ymm1 = [1,  3,  5,  7,  9, 11, 13, 15]
+    # Desired layout for this phase:
+    #   %ymm0 = [0,1,4,5,8,9,12,13]
+    #   %ymm1 = [2,3,6,7,10,11,14,15]
+
+    # Interleave even/odd within each 128-bit lane:
+    vpunpckldq %ymm1, %ymm0, %ymm8     # A = [0,1,2,3 | 8,9,10,11]
+    vpunpckhdq %ymm1, %ymm0, %ymm9     # B = [4,5,6,7 | 12,13,14,15]
+
+    # [0,1,4,5 | 8,9,12,13]
+    vshufps \$0x44, %ymm9, %ymm8, %ymm0
+
+    # [2,3,6,7 | 10,11,14,15]
+    vshufps \$0xee, %ymm9, %ymm8, %ymm1
+
+    # load 4 zetas and populate across ymm
+    vmovdqu $l1(%r11), %xmm13               # [0 1 2 3]
+    vpmovzxdq %xmm13, %ymm13
+    vmovsldup %ymm13, %ymm13                # [0 0 1 1] [2 2 3 3]
+___
+
+    &intt_butterfly("%ymm0", "%ymm1", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm0", "%ymm1",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    # Interleave even/odd within each 128-bit lane:
+    vpunpckldq %ymm3, %ymm2, %ymm8     # A = [0,1,2,3 | 8,9,10,11]
+    vpunpckhdq %ymm3, %ymm2, %ymm9     # B = [4,5,6,7 | 12,13,14,15]
+
+    # ymm2 = [0,1,4,5 | 8,9,12,13]
+    vshufps \$0x44, %ymm9, %ymm8, %ymm2
+
+    # ymm3 = [2,3,6,7 | 10,11,14,15]
+    vshufps \$0xee, %ymm9, %ymm8, %ymm3
+
+    # load 4 zetas and populate across YMM
+    vmovdqu $l1+16(%r11), %xmm13            # [0 1 2 3]
+    vpmovzxdq %xmm13, %ymm13
+    vmovsldup %ymm13, %ymm13                # [0 0 1 1] [2 2 3 3]
+___
+
+    &intt_butterfly("%ymm2", "%ymm3", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm2", "%ymm3",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+    # Interleave even/odd within each 128-bit lane:
+    vpunpckldq %ymm5, %ymm4, %ymm8     # A = [0,1,2,3 | 8,9,10,11]
+    vpunpckhdq %ymm5, %ymm4, %ymm9     # B = [4,5,6,7 | 12,13,14,15]
+
+    # [0,1,4,5 | 8,9,12,13]
+    vshufps \$0x44, %ymm9, %ymm8, %ymm4
+
+    # [2,3,6,7 | 10,11,14,15]
+    vshufps \$0xee, %ymm9, %ymm8, %ymm5
+
+    # load 4 zetas and populate across ymm
+    vmovdqu $l1+32(%r11), %xmm13             # [0 1 2 3]
+    vpmovzxdq %xmm13, %ymm13
+    vmovsldup %ymm13, %ymm13                # [0 0 1 1] [2 2 3 3]
+___
+
+    &intt_butterfly("%ymm4", "%ymm5", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm4", "%ymm5",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    # Interleave even/odd within each 128-bit lane:
+    vpunpckldq %ymm7, %ymm6, %ymm8     # A = [0,1,2,3 | 8,9,10,11]
+    vpunpckhdq %ymm7, %ymm6, %ymm9     # B = [4,5,6,7 | 12,13,14,15]
+
+    # ymm6 = [0,1,4,5 | 8,9,12,13]
+    vshufps \$0x44, %ymm9, %ymm8, %ymm6
+
+    # ymm7 = [2,3,6,7 | 10,11,14,15]
+    vshufps \$0xee, %ymm9, %ymm8, %ymm7
+
+    # load 4 zetas and populate across YMM
+    vmovdqu $l1+48(%r11), %xmm13            # [0 1 2 3]
+    vpmovzxdq %xmm13, %ymm13
+    vmovsldup %ymm13, %ymm13                # [0 0 1 1] [2 2 3 3]
+___
+
+    &intt_butterfly("%ymm6", "%ymm7", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm6", "%ymm7",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    # ==============================================================
+    # level 2: offset = 4, step = 32
+    # zeta indexes = 63, 62, 61, ... 33, 32
+    # zeta indexes (new) = 192, 193, 194, ... 223
+
+    # Result DWORD layout:
+    #   ymm0 = [0,1,4,5 | 8,9,12,13]
+    #   ymm1 = [2,3,6,7 | 10,11,14,15]
+    # Desired layout:
+    #   ymm8 = [0,1,2,3 | 8,9,10,11]
+    #   ymm1 = [4,5,6,7 | 12,13,14,15]
+
+    # ymm8 = [0,1,2,3 | 8,9,10,11]
+    vshufps \$0x44, %ymm1, %ymm0, %ymm8
+
+    # ymm1 = [4,5,6,7 | 12,13,14,15]
+    vshufps \$0xee, %ymm1, %ymm0, %ymm1
+
+    # broadcast zetas
+    vpbroadcastd $l2(%r11), %ymm13
+    vpbroadcastd $l2+4(%r11), %ymm12
+    vpblendd \$0xf0, %ymm12, %ymm13, %ymm13
+___
+
+    &intt_butterfly("%ymm8", "%ymm1", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm0", "%ymm1",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+    vshufps \$0x44, %ymm3, %ymm2, %ymm8
+    vshufps \$0xee, %ymm3, %ymm2, %ymm3
+
+    # broadcast zetas
+    vpbroadcastd $l2+8(%r11), %ymm13
+    vpbroadcastd $l2+12(%r11), %ymm12
+    vpblendd \$0xf0, %ymm12, %ymm13, %ymm13
+___
+
+    &intt_butterfly("%ymm8", "%ymm3", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm2", "%ymm3",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+    vshufps \$0x44, %ymm5, %ymm4, %ymm8
+    vshufps \$0xee, %ymm5, %ymm4, %ymm5
+
+    # broadcast zetas
+    vpbroadcastd $l2+16(%r11), %ymm13
+    vpbroadcastd $l2+20(%r11), %ymm12
+    vpblendd \$0xf0, %ymm12, %ymm13, %ymm13
+___
+
+    &intt_butterfly("%ymm8", "%ymm5", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm4", "%ymm5",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+    vshufps \$0x44, %ymm7, %ymm6, %ymm8
+    vshufps \$0xee, %ymm7, %ymm6, %ymm7
+
+    # broadcast zetas
+    vpbroadcastd $l2+24(%r11), %ymm13
+    vpbroadcastd $l2+28(%r11), %ymm12
+    vpblendd \$0xf0, %ymm12, %ymm13, %ymm13
+___
+
+    &intt_butterfly("%ymm8", "%ymm7", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm6", "%ymm7",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    # ==============================================================
+    # level 3: offset = 8, step = 16
+    # zeta indexes = 31, 30, 29, ... 17, 16
+    # zeta indexes (new) = 224, 225, 226, ... 239
+
+    #   ymm0 = [0,1,2,3 | 8,9,10,11]
+    #   ymm1 = [4,5,6,7 | 12,13,14,15]
+    vperm2i128 \$0x20, %ymm1, %ymm0, %ymm8      # [0,1,2,3 | 4,5,6,7]
+    vperm2i128 \$0x31, %ymm1, %ymm0, %ymm1      # [8,9,10,11 | 12,13,14,15]
+
+    # broadcast zetas
+    vpbroadcastd $l3(%r11), %ymm13
+___
+
+    &intt_butterfly("%ymm8", "%ymm1", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm0", "%ymm1",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    vperm2i128 \$0x20, %ymm3, %ymm2, %ymm8
+    vperm2i128 \$0x31, %ymm3, %ymm2, %ymm3
+
+    # broadcast zetas
+    vpbroadcastd $l3+4(%r11), %ymm13
+___
+
+    &intt_butterfly("%ymm8", "%ymm3", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm2", "%ymm3",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+    vperm2i128 \$0x20, %ymm5, %ymm4, %ymm8      # [0,1,2,3 | 4,5,6,7]
+    vperm2i128 \$0x31, %ymm5, %ymm4, %ymm5      # [8,9,10,11 | 12,13,14,15]
+
+    # broadcast zetas
+    vpbroadcastd $l3+8(%r11), %ymm13
+___
+
+    &intt_butterfly("%ymm8", "%ymm5", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm4", "%ymm5",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    vperm2i128 \$0x20, %ymm7, %ymm6, %ymm8
+    vperm2i128 \$0x31, %ymm7, %ymm6, %ymm7
+
+    # broadcast zetas
+    vpbroadcastd $l3+12(%r11), %ymm13
+___
+
+    &intt_butterfly("%ymm8", "%ymm7", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm6", "%ymm7",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    # ==============================================================
+    # level 4: offset = 16, step = 8
+    # zeta indexes = 15, 14, 13, ..., 9, 8
+    # zeta indexes (new) = 240, 241, 242, ... 247
+
+    # broadcast zetas
+    vpbroadcastd $l4(%r11), %ymm13
+___
+
+    &intt_butterfly("%ymm0", "%ymm2", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm0", "%ymm2",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+    &intt_butterfly("%ymm1", "%ymm3", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm1", "%ymm3",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+    # broadcast zetas
+    vpbroadcastd $l4+4(%r11), %ymm13
+___
+
+    &intt_butterfly("%ymm4", "%ymm6", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm4", "%ymm6",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+    &intt_butterfly("%ymm5", "%ymm7", "%ymm13",     # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12",   # temporary
+                    "%ymm5", "%ymm7",               # n_even, n_odd
+                    "%ymm14", "%ymm15");            # qinv, q32
+
+$code .= <<___;
+
+    # Store new w_even and w_odd
+    vmovdqu %ymm0, $off(%rdi)
+    vmovdqu %ymm1, $off+32(%rdi)
+    vmovdqu %ymm2, $off+64(%rdi)
+    vmovdqu %ymm3, $off+96(%rdi)
+    vmovdqu %ymm4, $off+128(%rdi)
+    vmovdqu %ymm5, $off+160(%rdi)
+    vmovdqu %ymm6, $off+192(%rdi)
+    vmovdqu %ymm7, $off+224(%rdi)
+___
+}
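+
+# The INTT runs the forward levels in reverse order (off = 1 up to 128); with
+# the reformatted INTT zeta table the zeta index simply increments (a sketch
+# of the assumed C structure, mirroring ml_dsa_ntt.c; reduce_once(x) stands
+# for x >= Q ? x - Q : x):
+#
+#   unsigned k = 0;
+#   for (unsigned off = 1; off <= 128; off <<= 1)         /* levels 0..7    */
+#       for (unsigned s = 0; s < 256; s += 2 * off) {
+#           uint32_t zeta = zetas_intt[k++];              /* indexes 0..254 */
+#           for (unsigned j = s; j < s + off; j++) {
+#               uint32_t e = w[j], o = w[j + off];
+#               w[j]       = reduce_once(e + o);
+#               w[j + off] = mont_mul_mod_q(e + 8380417u - o, zeta);
+#           }
+#       }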
+
+###############################################################################
+# intt_levels5to7
+#
+# Description:
+#   Processes the last three levels (5 to 7) of the INTT on 64 coefficients.
+#
+#   It completes the hierarchical merging of INTT, applying stage-specific zeta
+#   (twiddle) factors and modular butterfly operations for each level, and performs the final
+#   post-processing Montgomery multiplication at the end of the transform.
+#
+#   It must be invoked 4 times with offsets equal to 0*4, 8*4, 16*4 and 24*4 to cover
+#   all 256 coefficients.
+#
+# Layer/Level details:
+#   - Level 5: offset =  32, step = 4;   zeta indexes (new) = 248..251
+#   - Level 6: offset =  64, step = 2;   zeta indexes (new) = 252, 253
+#   - Level 7: offset = 128, step = 1;   zeta index   (new) = 254
+#
+#   After all INTT levels, multiplies the output by the modular inverse of
+#   the NTT degree (256^{-1} mod Q) in Montgomery form, undoing the inverse
+#   transform scaling to obtain the final reduced coefficients.
+#
+# Prerequisites:
+#   %rdi    - pointer to the coefficients array
+#   %r11    - pointer to the zetas (twiddle factors) table
+#   %ymm14  - q_neg_inv (for Montgomery reduction)
+#   %ymm15  - Q (modulus)
+#
+# Arguments:
+#   $off    - offset (in bytes) to the start of the 8-coefficient group
+#
+# Output:
+#   Overwrites memory at the given offset with the INTT-processed coefficients.
+#
+# Notes:
+#   - This subroutine must be called 4 times with appropriate offsets to process
+#     all 256 coefficients.
+###############################################################################
+
+sub intt_levels5to7 {
+    my ($off) = @_;
+    $code .= <<___;
+    vmovdqu $off+0*4(%rdi), %ymm0
+    vmovdqu $off+32*4(%rdi), %ymm1
+    vmovdqu $off+64*4(%rdi), %ymm2
+    vmovdqu $off+96*4(%rdi), %ymm3
+    vmovdqu $off+128*4(%rdi), %ymm4
+    vmovdqu $off+160*4(%rdi), %ymm5
+    vmovdqu $off+192*4(%rdi), %ymm6
+    vmovdqu $off+224*4(%rdi), %ymm7
+
+    # ==============================================================
+    # level 5: offset = 32, step = 4
+
+    vpbroadcastd 248*4(%r11), %ymm13
+___
+    &intt_butterfly("%ymm0", "%ymm1", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm0", "%ymm1",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+    $code .= <<___;
+    vpbroadcastd 249*4(%r11), %ymm13
+___
+    &intt_butterfly("%ymm2", "%ymm3", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm2", "%ymm3",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+
+    $code .= <<___;
+    vpbroadcastd 250*4(%r11), %ymm13
+___
+    &intt_butterfly("%ymm4", "%ymm5", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm4", "%ymm5",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+
+    $code .= <<___;
+    vpbroadcastd 251*4(%r11), %ymm13
+___
+    &intt_butterfly("%ymm6", "%ymm7", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm6", "%ymm7",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+    $code .= <<___;
+
+    # ==============================================================
+    # level 6: offset = 64, step = 2
+
+    vpbroadcastd 252*4(%r11), %ymm13
+___
+    &intt_butterfly("%ymm0", "%ymm2", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm0", "%ymm2",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+    &intt_butterfly("%ymm1", "%ymm3", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm1", "%ymm3",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+    $code .= <<___;
+    vpbroadcastd 253*4(%r11), %ymm13
+___
+    &intt_butterfly("%ymm4", "%ymm6", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm4", "%ymm6",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+    &intt_butterfly("%ymm5", "%ymm7", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm5", "%ymm7",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+    $code .= <<___;
+
+    # ==============================================================
+    # level 7: offset = 128, step = 1
+
+    vpbroadcastd 254*4(%r11), %ymm13
+___
+    &intt_butterfly("%ymm0", "%ymm4", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm0", "%ymm4",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+    &intt_butterfly("%ymm1", "%ymm5", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm1", "%ymm5",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+    &intt_butterfly("%ymm2", "%ymm6", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm2", "%ymm6",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+    &intt_butterfly("%ymm3", "%ymm7", "%ymm13", # w_even, w_odd, zetas
+                    "%ymm10", "%ymm11", "%ymm12", # temporary
+                    "%ymm3", "%ymm7",           # n_even, n_odd
+                    "%ymm14", "%ymm15");        # qinv, q32
+    $code .= <<___;
+
+    # ==============================================================
+    # final scaling: multiply by N^{-1} mod Q (Montgomery form)
+
+    vpbroadcastd ml_dsa_inverse_degree_montgomery(%rip), %ymm13
+___
+
+    &multiply_8x8_mod_Q("%ymm0", "%ymm13", "%ymm0", # A, B, out (AxB)
+                        "%ymm10", "%ymm11", "%ymm12", # tmp
+                        "%ymm14", "%ymm15");        # qinv, q
+    &multiply_8x8_mod_Q("%ymm4", "%ymm13", "%ymm4", # A, B, out (AxB)
+                        "%ymm10", "%ymm11", "%ymm12", # tmp
+                        "%ymm14", "%ymm15");        # qinv, q
+    &multiply_8x8_mod_Q("%ymm1", "%ymm13", "%ymm1", # A, B, out (AxB)
+                        "%ymm10", "%ymm11", "%ymm12", # tmp
+                        "%ymm14", "%ymm15");        # qinv, q
+    &multiply_8x8_mod_Q("%ymm5", "%ymm13", "%ymm5", # A, B, out (AxB)
+                        "%ymm10", "%ymm11", "%ymm12", # tmp
+                        "%ymm14", "%ymm15");        # qinv, q
+    &multiply_8x8_mod_Q("%ymm2", "%ymm13", "%ymm2", # A, B, out (AxB)
+                        "%ymm10", "%ymm11", "%ymm12", # tmp
+                        "%ymm14", "%ymm15");        # qinv, q
+    &multiply_8x8_mod_Q("%ymm6", "%ymm13", "%ymm6", # A, B, out (AxB)
+                        "%ymm10", "%ymm11", "%ymm12", # tmp
+                        "%ymm14", "%ymm15");        # qinv, q
+    &multiply_8x8_mod_Q("%ymm3", "%ymm13", "%ymm3", # A, B, out (AxB)
+                        "%ymm10", "%ymm11", "%ymm12", # tmp
+                        "%ymm14", "%ymm15");        # qinv, q
+    &multiply_8x8_mod_Q("%ymm7", "%ymm13", "%ymm7", # A, B, out (AxB)
+                        "%ymm10", "%ymm11", "%ymm12", # tmp
+                        "%ymm14", "%ymm15");        # qinv, q
+
+    $code .= <<___;
+
+    vmovdqu %ymm0, $off+0*4(%rdi)
+    vmovdqu %ymm1, $off+32*4(%rdi)
+    vmovdqu %ymm2, $off+64*4(%rdi)
+    vmovdqu %ymm3, $off+96*4(%rdi)
+    vmovdqu %ymm4, $off+128*4(%rdi)
+    vmovdqu %ymm5, $off+160*4(%rdi)
+    vmovdqu %ymm6, $off+192*4(%rdi)
+    vmovdqu %ymm7, $off+224*4(%rdi)
+___
+}
+
+$code .= <<___;
+###############################################################################
+###############################################################################
+### Data section
+
+.section .rodata
+
+###############################################################################
+# zetas_inverse:
+#
+# Description:
+#   Table of inverse NTT "zetas" (twiddle factors), precomputed as powers of
+#   the primitive root of unity (mod Q) required for each stage of the inverse
+#   Number Theoretic Transform (INTT). Entries are derived from the forward
+#   NTT zetas (ml_dsa_ntt.c) and arranged in the stage order required by the
+#   INTT implementation.
+#   Elements of the table are stored in reversed order relative to the forward
+#   NTT table. This reduces the number of permute operations needed when
+#   preparing the zetas in the correct format for the butterfly operations.
+#
+#   Each .long entry corresponds to a 32-bit modular value (Q - zeta) for the
+#   appropriate stage and index.
+#
+#   The exact arrangement and computation of these zetas matches the INTT
+#   schedule and must be consistent with the NTT forward transform (ml_dsa_ntt.c).
+###############################################################################
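+#
+# Illustrative construction (a sketch only, assuming zetas_montgomery is the
+# forward zeta table from ml_dsa_ntt.c; the .long table below is what is
+# actually assembled):
+#
+#   for (j = 0; j < 256; j++)
+#       zetas_inverse[j] = ML_DSA_Q - zetas_montgomery[255 - j];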
+.align 64
+zetas_inverse:
+    .long $ML_DSA_Q - 1976782, $ML_DSA_Q - 7534263, $ML_DSA_Q - 1400424, $ML_DSA_Q - 3937738, $ML_DSA_Q - 7018208, $ML_DSA_Q - 8332111, $ML_DSA_Q - 3919660, $ML_DSA_Q - 7826001
+    .long $ML_DSA_Q - 4834730, $ML_DSA_Q - 1612842, $ML_DSA_Q - 7403526, $ML_DSA_Q - 183443,  $ML_DSA_Q - 6094090, $ML_DSA_Q - 7959518, $ML_DSA_Q - 6144432, $ML_DSA_Q - 5441381
+    .long $ML_DSA_Q - 4546524, $ML_DSA_Q - 8119771, $ML_DSA_Q - 7276084, $ML_DSA_Q - 6712985, $ML_DSA_Q - 1910376, $ML_DSA_Q - 6577327, $ML_DSA_Q - 1723600, $ML_DSA_Q - 7953734
+    .long $ML_DSA_Q - 472078,  $ML_DSA_Q - 1717735, $ML_DSA_Q - 7404533, $ML_DSA_Q - 2213111, $ML_DSA_Q - 269760,  $ML_DSA_Q - 3866901, $ML_DSA_Q - 3523897, $ML_DSA_Q - 5341501
+    .long $ML_DSA_Q - 6581310, $ML_DSA_Q - 4686184, $ML_DSA_Q - 1652634, $ML_DSA_Q - 810149,  $ML_DSA_Q - 3014001, $ML_DSA_Q - 1616392, $ML_DSA_Q - 162844,  $ML_DSA_Q - 5196991
+    .long $ML_DSA_Q - 7173032, $ML_DSA_Q - 185531,  $ML_DSA_Q - 3369112, $ML_DSA_Q - 1957272, $ML_DSA_Q - 8215696, $ML_DSA_Q - 2454455, $ML_DSA_Q - 2432395, $ML_DSA_Q - 6366809
+    .long $ML_DSA_Q - 4603424, $ML_DSA_Q - 594136,  $ML_DSA_Q - 4656147, $ML_DSA_Q - 5796124, $ML_DSA_Q - 6533464, $ML_DSA_Q - 6709241, $ML_DSA_Q - 5548557, $ML_DSA_Q - 7838005
+    .long $ML_DSA_Q - 3406031, $ML_DSA_Q - 2235880, $ML_DSA_Q - 777191,  $ML_DSA_Q - 1500165, $ML_DSA_Q - 7005614, $ML_DSA_Q - 5834105, $ML_DSA_Q - 1917081, $ML_DSA_Q - 7100756
+    .long $ML_DSA_Q - 6417775, $ML_DSA_Q - 3306115, $ML_DSA_Q - 1312455, $ML_DSA_Q - 7929317, $ML_DSA_Q - 6950192, $ML_DSA_Q - 5062207, $ML_DSA_Q - 1237275, $ML_DSA_Q - 7047359
+    .long $ML_DSA_Q - 7329447, $ML_DSA_Q - 1903435, $ML_DSA_Q - 1869119, $ML_DSA_Q - 5386378, $ML_DSA_Q - 4832145, $ML_DSA_Q - 2635921, $ML_DSA_Q - 1250494, $ML_DSA_Q - 4613401
+    .long $ML_DSA_Q - 1595974, $ML_DSA_Q - 2486353, $ML_DSA_Q - 1247620, $ML_DSA_Q - 4055324, $ML_DSA_Q - 1265009, $ML_DSA_Q - 5790267, $ML_DSA_Q - 2691481, $ML_DSA_Q - 2842341
+    .long $ML_DSA_Q - 203044,  $ML_DSA_Q - 1735879, $ML_DSA_Q - 5038140, $ML_DSA_Q - 3437287, $ML_DSA_Q - 4108315, $ML_DSA_Q - 5942594, $ML_DSA_Q - 286988,  $ML_DSA_Q - 342297
+    .long $ML_DSA_Q - 4784579, $ML_DSA_Q - 7611795, $ML_DSA_Q - 7855319, $ML_DSA_Q - 4823422, $ML_DSA_Q - 3207046, $ML_DSA_Q - 2031748, $ML_DSA_Q - 5257975, $ML_DSA_Q - 7725090
+    .long $ML_DSA_Q - 7857917, $ML_DSA_Q - 8337157, $ML_DSA_Q - 6767243, $ML_DSA_Q - 495491,  $ML_DSA_Q - 819034,  $ML_DSA_Q - 909542,  $ML_DSA_Q - 1859098, $ML_DSA_Q - 900702
+    .long $ML_DSA_Q - 5187039, $ML_DSA_Q - 7183191, $ML_DSA_Q - 4621053, $ML_DSA_Q - 4860065, $ML_DSA_Q - 3513181, $ML_DSA_Q - 7144689, $ML_DSA_Q - 2434439, $ML_DSA_Q - 266997
+    .long $ML_DSA_Q - 4817955, $ML_DSA_Q - 5933984, $ML_DSA_Q - 2244091, $ML_DSA_Q - 5037939, $ML_DSA_Q - 3817976, $ML_DSA_Q - 2316500, $ML_DSA_Q - 3407706, $ML_DSA_Q - 2091667
+    .long $ML_DSA_Q - 3839961, $ML_DSA_Q - 4751448, $ML_DSA_Q - 4499357, $ML_DSA_Q - 5361315, $ML_DSA_Q - 6940675, $ML_DSA_Q - 7567685, $ML_DSA_Q - 6795489, $ML_DSA_Q - 1285669
+    .long $ML_DSA_Q - 1341330, $ML_DSA_Q - 1315589, $ML_DSA_Q - 8202977, $ML_DSA_Q - 5971092, $ML_DSA_Q - 6529015, $ML_DSA_Q - 3159746, $ML_DSA_Q - 4827145, $ML_DSA_Q - 189548
+    .long $ML_DSA_Q - 7063561, $ML_DSA_Q - 759969,  $ML_DSA_Q - 8169440, $ML_DSA_Q - 2389356, $ML_DSA_Q - 5130689, $ML_DSA_Q - 1653064, $ML_DSA_Q - 8371839, $ML_DSA_Q - 4656075
+    .long $ML_DSA_Q - 3958618, $ML_DSA_Q - 904516,  $ML_DSA_Q - 7280319, $ML_DSA_Q - 44288,   $ML_DSA_Q - 3097992, $ML_DSA_Q - 508951,  $ML_DSA_Q - 264944,  $ML_DSA_Q - 5037034
+    .long $ML_DSA_Q - 6949987, $ML_DSA_Q - 1852771, $ML_DSA_Q - 1349076, $ML_DSA_Q - 7998430, $ML_DSA_Q - 7072248, $ML_DSA_Q - 8357436, $ML_DSA_Q - 7151892, $ML_DSA_Q - 7709315
+    .long $ML_DSA_Q - 5903370, $ML_DSA_Q - 7969390, $ML_DSA_Q - 4686924, $ML_DSA_Q - 5412772, $ML_DSA_Q - 2715295, $ML_DSA_Q - 2147896, $ML_DSA_Q - 7396998, $ML_DSA_Q - 3412210
+    .long $ML_DSA_Q - 126922,  $ML_DSA_Q - 4747489, $ML_DSA_Q - 5223087, $ML_DSA_Q - 5190273, $ML_DSA_Q - 7380215, $ML_DSA_Q - 4296819, $ML_DSA_Q - 1939314, $ML_DSA_Q - 7122806
+    .long $ML_DSA_Q - 6795196, $ML_DSA_Q - 2176455, $ML_DSA_Q - 3475950, $ML_DSA_Q - 6927966, $ML_DSA_Q - 5339162, $ML_DSA_Q - 4702672, $ML_DSA_Q - 6851714, $ML_DSA_Q - 4450022
+    .long $ML_DSA_Q - 5582638, $ML_DSA_Q - 2071892, $ML_DSA_Q - 5823537, $ML_DSA_Q - 3900724, $ML_DSA_Q - 3881043, $ML_DSA_Q - 954230,  $ML_DSA_Q - 531354,  $ML_DSA_Q - 811944
+    .long $ML_DSA_Q - 3699596, $ML_DSA_Q - 6779997, $ML_DSA_Q - 6239768, $ML_DSA_Q - 3507263, $ML_DSA_Q - 4558682, $ML_DSA_Q - 3505694, $ML_DSA_Q - 6736599, $ML_DSA_Q - 6681150
+    .long $ML_DSA_Q - 7841118, $ML_DSA_Q - 2348700, $ML_DSA_Q - 8079950, $ML_DSA_Q - 3539968, $ML_DSA_Q - 5512770, $ML_DSA_Q - 3574422, $ML_DSA_Q - 5336701, $ML_DSA_Q - 4519302
+    .long $ML_DSA_Q - 3915439, $ML_DSA_Q - 5842901, $ML_DSA_Q - 4788269, $ML_DSA_Q - 6718724, $ML_DSA_Q - 3530437, $ML_DSA_Q - 3077325, $ML_DSA_Q - 95776,   $ML_DSA_Q - 2706023
+    .long $ML_DSA_Q - 280005,  $ML_DSA_Q - 4010497, $ML_DSA_Q - 8360995, $ML_DSA_Q - 1757237, $ML_DSA_Q - 5102745, $ML_DSA_Q - 6980856, $ML_DSA_Q - 4520680, $ML_DSA_Q - 6262231
+    .long $ML_DSA_Q - 6271868, $ML_DSA_Q - 2619752, $ML_DSA_Q - 7260833, $ML_DSA_Q - 7830929, $ML_DSA_Q - 3585928, $ML_DSA_Q - 7300517, $ML_DSA_Q - 1024112, $ML_DSA_Q - 2725464
+    .long $ML_DSA_Q - 2680103, $ML_DSA_Q - 3111497, $ML_DSA_Q - 5495562, $ML_DSA_Q - 3119733, $ML_DSA_Q - 6288512, $ML_DSA_Q - 8021166, $ML_DSA_Q - 2353451, $ML_DSA_Q - 1826347
+    .long $ML_DSA_Q - 466468,  $ML_DSA_Q - 7504169, $ML_DSA_Q - 7602457, $ML_DSA_Q - 237124,  $ML_DSA_Q - 7861508, $ML_DSA_Q - 5771523, $ML_DSA_Q - 25847,   $ML_DSA_Q - 4193792
+
+.align 32
+idx_even:
+    .long 0,2,4,6, 0,2,4,6
+
+.align 32
+idx_odd:
+    .long 1,3,5,7, 1,3,5,7
+
+# Modulus Q for ML-DSA NTT (2^23 - 2^13 + 1 = 8380417)
+.align 8
+ml_dsa_q:
+    .quad $ML_DSA_Q
+
+# -Q^{-1} mod 2^32 (Montgomery parameter for ML-DSA modular reduction)
+.align 8
+ml_dsa_q_neg_inv:
+    .quad $ML_DSA_Q_NEG_INV
+
+# (N^{-1} mod Q) in Montgomery form for scaling after inverse NTT
+.align 8
+ml_dsa_inverse_degree_montgomery:
+    .quad $inverse_degree_montgomery
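+# (the portable code in ml_dsa_ntt.c defines the same scaling constant as
+#  inverse_degree_montgomery = 41978)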
+
+###############################################################################
+###############################################################################
+### Code section
+
+.text
+
+###############################################################################
+# ml_dsa_poly_ntt_mult_avx2
+#
+# C Prototype:
+#   void ml_dsa_poly_ntt_mult_avx2(
+#       const uint32_t *a,       // (rdi) Input polynomial A (in NTT domain)
+#       const uint32_t *b,       // (rsi) Input polynomial B (in NTT domain)
+#       uint32_t *out            // (rdx) Output polynomial (result)
+#   );
+#
+# Description:
+#   Top-level routine for polynomial multiplication in ML-DSA. This function
+#   performs pointwise multiplication of two polynomials in the NTT domain
+#   using AVX2 vector instructions.
+#   It assumes polynomials of 256 coefficients.
+#
+#   out[i] = a[i] x b[i] mod Q
+#
+#   The function:
+#     - Takes pointers to source polynomials (NTT domain) and destination buffer
+#     - Performs element-wise modular (pointwise) multiplication in the NTT domain
+#     - Applies Montgomery reduction for efficient modular arithmetic
+#
+# Inputs:
+#   a   - First input polynomial, NTT domain
+#   b   - Second input polynomial, NTT domain
+#   out - Destination buffer for output coefficients
+#
+# Output:
+#   - Output buffer 'out' contains the coefficient-wise modular product of
+#     the input polynomials (still in NTT domain)
+#
+###############################################################################
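+#
+# Scalar C equivalent of the loop below (this mirrors the portable fallback in
+# ossl_ml_dsa_poly_ntt_mult, ml_dsa_ntt.c):
+#
+#   for (i = 0; i < 256; i++)
+#       out[i] = reduce_montgomery((uint64_t)a[i] * (uint64_t)b[i]);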
+
+.globl  ml_dsa_poly_ntt_mult_avx2
+.type   ml_dsa_poly_ntt_mult_avx2,\@function,3
+.align 32
+ml_dsa_poly_ntt_mult_avx2:
+.cfi_startproc
+    vpbroadcastq ml_dsa_q_neg_inv(%rip), %ymm14
+    vpbroadcastd ml_dsa_q(%rip), %ymm15
+    xor %r10d, %r10d
+
+.align 32
+.Lmult_loop:
+    # Load a and b into ymm registers
+    vmovdqu (%rdi,%r10), %ymm0   # a[0:7]
+    vmovdqu (%rsi,%r10), %ymm1   # b[0:7]
+
+    # multiply this part of input data
+___
+
+    &multiply_8x8_mod_Q("%ymm0", "%ymm1", "%ymm0",  # A, B, out (AxB)
+                        "%ymm8", "%ymm9", "%ymm10", # tmp
+                        "%ymm14", "%ymm15");        # qinv, q
+
+$code .= <<___;
+    # store result to output
+    vmovdqu %ymm0, (%rdx,%r10)
+
+    # start new iteration
+    add \$8*4, %r10d
+    cmp \$256*4, %r10d
+    jb .Lmult_loop
+
+    vzeroall
+    ret
+.cfi_endproc
+.size   ml_dsa_poly_ntt_mult_avx2, .-ml_dsa_poly_ntt_mult_avx2
+
+###############################################################################
+# ml_dsa_poly_ntt_avx2
+#
+# C Prototype:
+#   void ml_dsa_poly_ntt_avx2(
+#       uint32_t *p_coeffs,     // Pointer to coefficients (input: normal domain, output: NTT domain)
+#       const uint32_t *p_zetas // Pointer to zeta (twiddle factor) table for the forward NTT
+#   );
+#
+# Description:
+#   Top-level implementation of the forward Number Theoretic
+#   Transform (NTT) for ML-DSA polynomials. This function converts a polynomial
+#   from its standard coefficient (normal) form to its NTT representation,
+#   storing the result in-place in the provided coefficients array. The function
+#   uses stage-specific "zeta" (twiddle factor) tables passed from C (ml_dsa_ntt.c).
+#
+#   The function:
+#     - Takes a buffer of polynomial coefficients in normal (standard) order
+#     - Uses the provided zeta table for all twiddle-factor multiplications
+#     - Processes the NTT in a breadth-first, layered fashion with AVX2 SIMD
+#     - Overwrites the input buffer with its NTT-domain representation
+#
+# Inputs:
+#   p_coeffs - Pointer to the coefficient array (will be overwritten in-place by the NTT result)
+#   p_zetas  - Pointer to the precomputed table of forward NTT zeta (twiddle) factors (from ml_dsa_ntt.c)
+#
+# Output:
+#   - The 'p_coeffs' array is updated in-place with the corresponding NTT-domain representation.
+###############################################################################
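+#
+# Schematic scalar butterfly performed at each level (a sketch only; the exact
+# index bookkeeping and reduction points follow the portable fallback in
+# ml_dsa_ntt.c, and reduce_once is a hypothetical conditional-subtract helper):
+#
+#   t             = reduce_montgomery((uint64_t)zeta * (uint64_t)p[j + offset]);
+#   p[j + offset] = reduce_once(p[j] + ML_DSA_Q - t);
+#   p[j]          = reduce_once(p[j] + t);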
+.globl  ml_dsa_poly_ntt_avx2
+.type   ml_dsa_poly_ntt_avx2,\@function,2
+.align 32
+ml_dsa_poly_ntt_avx2:
+.cfi_startproc
+
+    sub \$32, %rsp
+.cfi_adjust_cfa_offset 32   # track rsp change so unwinder can find CFA
+
+    # save input arguments
+    mov %rdi, %r10
+    mov %rsi, %r11
+
+    # load constants
+    vpbroadcastq ml_dsa_q_neg_inv(%rip), %ymm14
+    vpbroadcastd ml_dsa_q(%rip), %ymm15     # 32-bit Q
+
+    # ==============================================================
+    # - level 0: offset = 128, step = 1, zeta indexes = 1
+    # - level 1: offset = 64, step = 2, zeta indexes = 2, 3
+    # - level 2: offset = 32, step = 4, zeta indexes = 4, 5, 6, 7
+
+    mov %r10, %rdi                  # p_coeffs
+___
+
+    &ntt_levels0to2(0*4);
+    &ntt_levels0to2(8*4);
+    &ntt_levels0to2(16*4);
+    &ntt_levels0to2(24*4);
+
+$code .= <<___;
+
+    # ==============================================================
+    # - level 3: offset = 16, step = 8
+    #     zeta indexes = 8, 9, 10, 11, 12, 13, 14, 15
+    # - level 4: offset = 8, step = 16
+    #     zeta indexes = 16, 17, 18, ..., 30, 31
+    # - level 5: offset = 4, step = 32
+    #     zeta indexes = 32, 33, 34, ..., 62, 63
+    # - level 6: offset = 2, step = 64
+    #     zeta indexes = 64, 65, 66, ..., 126, 127
+    # - level 7: offset = 1, step = 128
+    #     zeta indexes = 128, 129, 130, ..., 254, 255
+
+    mov %r10, %rdi                  # p_even / p_coeff
+___
+
+    # arguments:    coeff,   l3,    l4,    l5,    l6,    l7
+    &ntt_levels3to7(  0*4,  8*4,  16*4,  32*4,  64*4, 128*4);
+    &ntt_levels3to7( 64*4, 10*4,  20*4,  40*4,  80*4, 160*4);
+    &ntt_levels3to7(128*4, 12*4,  24*4,  48*4,  96*4, 192*4);
+    &ntt_levels3to7(192*4, 14*4,  28*4,  56*4, 112*4, 224*4);
+
+$code .= <<___;
+
+    vzeroall
+
+    lea 32(%rsp), %rsp
+.cfi_adjust_cfa_offset -32
+
+    ret
+.cfi_endproc
+.size   ml_dsa_poly_ntt_avx2, .-ml_dsa_poly_ntt_avx2
+
+###############################################################################
+# ml_dsa_poly_ntt_inverse_avx2
+#
+# C Prototype:
+#     void ml_dsa_poly_ntt_inverse_avx2(
+#        uint32_t *p_coeffs // (rdi) Pointer to coefficients
+#                           // input: NTT domain, output: normal domain, in-place
+#     );
+#
+# Description:
+#   Top-level implementation of the inverse Number Theoretic
+#   Transform (INTT) for ML-DSA polynomials. This function converts a polynomial
+#   from the NTT domain back to the standard coefficient (normal) domain,
+#   storing the result in-place in the provided buffer. The required inverse zeta
+#   (twiddle) factors are managed internally.
+#
+#   The function:
+#     - Accepts a buffer of NTT-domain coefficients
+#     - Overwrites the input buffer with the result in the normal (coefficient) domain
+#
+# Inputs:
+#   p_coeffs - Pointer to the polynomial coefficient array (in-place transform)
+#
+# Output:
+#   - The 'p_coeffs' array is updated in-place to contain the standard domain polynomial.
+#   - Uses the 'zetas_inverse' table defined in this file.
+###############################################################################
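+#
+# Schematic scalar inverse (Gentleman-Sande) butterfly per level (a sketch
+# only, up to the sign convention of the stored twiddles; zeta_inv stands for
+# the zetas_inverse entry and reduce_once is a hypothetical
+# conditional-subtract helper):
+#
+#   t             = p[j];
+#   p[j]          = reduce_once(t + p[j + offset]);
+#   p[j + offset] = reduce_montgomery((uint64_t)(t + ML_DSA_Q - p[j + offset])
+#                                     * (uint64_t)zeta_inv);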
+.globl  ml_dsa_poly_ntt_inverse_avx2
+.type   ml_dsa_poly_ntt_inverse_avx2,\@function,1
+.align 32
+ml_dsa_poly_ntt_inverse_avx2:
+.cfi_startproc
+    lea zetas_inverse(%rip), %r11
+
+    vpbroadcastq ml_dsa_q_neg_inv(%rip), %ymm14
+    vpbroadcastd ml_dsa_q(%rip), %ymm15
+
+    # ==============================================================
+    # - level 0: offset = 1, step = 128
+    #     zeta indexes (original table) = 255, 254, 253, ... 129, 128
+    #     zeta indexes (new table) = 0, 1, 2, .. 127
+    # - level 1: offset = 2, step = 64
+    #     zeta indexes (original table) = 127, 126, 125, ... 65, 64
+    #     zeta indexes (new table) = 128, 129, .. 191
+    # - level 2: offset = 4, step = 32
+    #     zeta indexes (original table) = 63, 62, 61, ... 33, 32
+    #     zeta indexes (new table) = 192, 193, 194, ... 223
+    # - level 3: offset = 8, step = 16
+    #     zeta indexes (original table) = 31, 30, 29, ... 17, 16
+    #     zeta indexes (new table) = 224, 225, 226, ... 239
+    # - level 4: offset = 16, step = 8
+    #     zeta indexes (original table) = 15, 14, 13, ..., 9, 8
+    #     zeta indexes (new table) = 240, 241, 242, ... 247
+
+___
+
+    #  arguments:    coeff,   l0,    l1,    l2,    l3,    l4
+    &intt_levels0to4(0*4,    0*4, 128*4, 192*4, 224*4, 240*4);
+    &intt_levels0to4(64*4,  32*4, 144*4, 200*4, 228*4, 242*4);
+    &intt_levels0to4(128*4, 64*4, 160*4, 208*4, 232*4, 244*4);
+    &intt_levels0to4(192*4, 96*4, 176*4, 216*4, 236*4, 246*4);
+
+$code .= <<___;
+
+    # ==============================================================
+    # - level 5: offset = 32, step = 4
+    #   zeta indexes (original table) = 7, 6, 5, 4
+    #   zeta indexes (new table) = 248, 249, 250, 251
+    # - level 6: offset = 64, step = 2
+    #   zeta indexes (original table) = 3, 2
+    #   zeta indexes (new table) = 252, 253
+    # - level 7: offset = 128, step = 1
+    #   zeta indexes (original table) = 1
+    #   zeta indexes (new table) = 254
+___
+    &intt_levels5to7(0*4);
+    &intt_levels5to7(8*4);
+    &intt_levels5to7(16*4);
+    &intt_levels5to7(24*4);
+$code .= <<___;
+
+    vzeroall
+    ret
+.cfi_endproc
+.size   ml_dsa_poly_ntt_inverse_avx2, .-ml_dsa_poly_ntt_inverse_avx2
+___
+
+}}} else {{{
+# When AVX2 support is not available at build time, emit stub functions:
+# the capability function returns 0 and the operation functions trap (ud2)
+# if they are ever called.
+$code .= <<___;
+.text
+
+.globl  ml_dsa_ntt_avx2_capable
+.type   ml_dsa_ntt_avx2_capable,\@abi-omnipotent
+ml_dsa_ntt_avx2_capable:
+    xor     %eax, %eax
+    ret
+.size   ml_dsa_ntt_avx2_capable, .-ml_dsa_ntt_avx2_capable
+
+.globl  ml_dsa_poly_ntt_mult_avx2
+.globl  ml_dsa_poly_ntt_avx2
+.globl  ml_dsa_poly_ntt_inverse_avx2
+.type   ml_dsa_poly_ntt_mult_avx2,\@abi-omnipotent
+ml_dsa_poly_ntt_mult_avx2:
+ml_dsa_poly_ntt_avx2:
+ml_dsa_poly_ntt_inverse_avx2:
+    .byte   0x0f,0x0b       # ud2
+    ret
+.size   ml_dsa_poly_ntt_mult_avx2, .-ml_dsa_poly_ntt_mult_avx2
+___
+}}}
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/ml_dsa/build.info b/crypto/ml_dsa/build.info
index a0aee56f5a..937a8a2f66 100644
--- a/crypto/ml_dsa/build.info
+++ b/crypto/ml_dsa/build.info
@@ -4,7 +4,19 @@ $COMMON=ml_dsa_encoders.c ml_dsa_key_compress.c ml_dsa_key.c \
         ml_dsa_matrix.c ml_dsa_ntt.c ml_dsa_params.c ml_dsa_sample.c \
         ml_dsa_sign.c

+$ML_DSA_ASM=
+IF[{- !$disabled{asm} -}]
+  $ML_DSA_ASM_x86_64=ml_dsa_ntt-x86_64.s
+
+  IF[$ML_DSA_ASM_{- $target{asm_arch} -}]
+    $ML_DSA_ASM=$ML_DSA_ASM_{- $target{asm_arch} -}
+  ENDIF
+ENDIF
+
 IF[{- !$disabled{'ml-dsa'} -}]
-  SOURCE[../../libcrypto]=$COMMON
-  SOURCE[../../providers/libfips.a]=$COMMON
+  SOURCE[../../libcrypto]=$COMMON $ML_DSA_ASM
+  SOURCE[../../providers/libfips.a]=$COMMON $ML_DSA_ASM
 ENDIF
+
+# Assembly implementations
+GENERATE[ml_dsa_ntt-x86_64.s]=asm/ml_dsa_ntt-x86_64.pl
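+# The generated .s is produced by invoking the perlasm script with the
+# target's assembler flavour and output path, roughly (illustrative; the
+# build system supplies the actual arguments):
+#   perl asm/ml_dsa_ntt-x86_64.pl $(PERLASM_SCHEME) ml_dsa_ntt-x86_64.s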
diff --git a/crypto/ml_dsa/ml_dsa_ntt.c b/crypto/ml_dsa/ml_dsa_ntt.c
index 2cce462292..41e8b27e81 100644
--- a/crypto/ml_dsa/ml_dsa_ntt.c
+++ b/crypto/ml_dsa/ml_dsa_ntt.c
@@ -10,6 +10,14 @@
 #include "ml_dsa_local.h"
 #include "ml_dsa_poly.h"

+/* Assembly function declarations for AVX2 implementations */
+#if !defined(OPENSSL_NO_ASM) && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64))
+int ml_dsa_ntt_avx2_capable(void);
+void ml_dsa_poly_ntt_avx2(uint32_t *p_coeff, const uint32_t *p_zetas);
+void ml_dsa_poly_ntt_inverse_avx2(uint32_t *p_coeff);
+void ml_dsa_poly_ntt_mult_avx2(const uint32_t *a, const uint32_t *b, uint32_t *out);
+#endif
+
 /*
  * This file has multiple parts required for fast matrix multiplication,
  * 1) NTT (See https://eprint.iacr.org/2024/585.pdf)
@@ -112,6 +120,13 @@ void ossl_ml_dsa_poly_ntt_mult(const POLY *lhs, const POLY *rhs, POLY *out)
 {
     int i;

+#if !defined(OPENSSL_NO_ASM) && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64))
+    if (ml_dsa_ntt_avx2_capable()) {
+        ml_dsa_poly_ntt_mult_avx2(&lhs->coeff[0], &rhs->coeff[0], &out->coeff[0]);
+        return;
+    }
+#endif
+
     for (i = 0; i < ML_DSA_NUM_POLY_COEFFICIENTS; i++)
         out->coeff[i] = reduce_montgomery((uint64_t)lhs->coeff[i] * (uint64_t)rhs->coeff[i]);
 }
@@ -131,6 +146,14 @@ void ossl_ml_dsa_poly_ntt(POLY *p)
     int step;
     int offset = ML_DSA_NUM_POLY_COEFFICIENTS;

+#if !defined(OPENSSL_NO_ASM) && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64))
+    if (ml_dsa_ntt_avx2_capable()) {
+        ml_dsa_poly_ntt_avx2(&p->coeff[0], zetas_montgomery);
+        return;
+    }
+#endif
+
+    /* Fallback implementation */
     /* Step: 1, 2, 4, 8, ..., 128 */
     for (step = 1; step < ML_DSA_NUM_POLY_COEFFICIENTS; step <<= 1) {
         k = 0;
@@ -171,6 +194,14 @@ void ossl_ml_dsa_poly_ntt_inverse(POLY *p)
      */
     static const uint32_t inverse_degree_montgomery = 41978;

+#if !defined(OPENSSL_NO_ASM) && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64))
+    if (ml_dsa_ntt_avx2_capable()) {
+        ml_dsa_poly_ntt_inverse_avx2(&p->coeff[0]);
+        return;
+    }
+#endif
+
+    /* Fallback implementation */
     for (offset = 1; offset < ML_DSA_NUM_POLY_COEFFICIENTS; offset <<= 1) {
         step >>= 1;
         k = 0;