Commit 24486294f7 for openssl.org
commit 24486294f7e8ec69318c3608edb191d7377735f1
Author: Timo Keller <tkeller@linux.ibm.com>
Date: Mon Mar 16 07:18:26 2026 +0100
Vectorize (inverse) NTT in ML-DSA
The vectorization is implemented using vector extensions (of gcc/clang)
and will work on any architecture with 128 bit vector registers that has
the builtin `vec_mulh` for the high part of a multiplication.
Enable this for s390x.
The speed-up factor on z17 is around 2--3.4.
Signed-off-by: Timo Keller <tkeller@linux.ibm.com>
Reviewed-by: Shane Lontis <shane.lontis@oracle.com>
Reviewed-by: Saša Nedvědický <sashan@openssl.org>
MergeDate: Wed Jun 10 09:25:58 2026
(Merged from https://github.com/openssl/openssl/pull/30812)
diff --git a/CHANGES.md b/CHANGES.md
index 08bf6045bb..eaeef66935 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -178,6 +178,11 @@ OpenSSL Releases
*Norbert Pócs*
+ * Added optimized ML-DSA NTT operations on `s390x`
+ (or other architectures with 128 bit vector registers).
+
+ *Timo Keller*
+
### Changes between 3.6 and 4.0.0 [14 Apr 2026]
* Added `-expected-rpks` option to the `openssl s_client`
diff --git a/crypto/ml_dsa/build.info b/crypto/ml_dsa/build.info
index 937a8a2f66..e41867f573 100644
--- a/crypto/ml_dsa/build.info
+++ b/crypto/ml_dsa/build.info
@@ -14,8 +14,25 @@ IF[{- !$disabled{asm} -}]
ENDIF
IF[{- !$disabled{'ml-dsa'} -}]
- SOURCE[../../libcrypto]=$COMMON $ML_DSA_ASM
- SOURCE[../../providers/libfips.a]=$COMMON $ML_DSA_ASM
+ IF[{- ($target{perlasm_scheme} // '') ne '31' -}]
+ $ML_DSA_VX_s390x=ml_dsa_ntt_vec128.c
+ $ML_DSA_DEF_s390x=OPENSSL_ML_DSA_S390X
+ ENDIF
+
+ # Now that we have defined all the arch specific variables, use the
+ # appropriate ones, and define the appropriate macros
+ IF[$ML_DSA_VX_{- $target{asm_arch} -}]
+ $ML_DSA_VX=$ML_DSA_VX_{- $target{asm_arch} -}
+ $ML_DSA_DEF=$ML_DSA_DEF_{- $target{asm_arch} -}
+ ENDIF
+ENDIF
+
+DEFINE[../../libcrypto]=$ML_DSA_DEF
+DEFINE[../../providers/libfips.a]=$ML_DSA_DEF
+
+IF[{- !$disabled{'ml-dsa'} -}]
+ SOURCE[../../libcrypto]=$COMMON $ML_DSA_ASM $ML_DSA_VX
+ SOURCE[../../providers/libfips.a]=$COMMON $ML_DSA_ASM $ML_DSA_VX
ENDIF
# Assembly implementations
diff --git a/crypto/ml_dsa/ml_dsa_local.h b/crypto/ml_dsa/ml_dsa_local.h
index bbaa6dafc7..9d01856ce3 100644
--- a/crypto/ml_dsa/ml_dsa_local.h
+++ b/crypto/ml_dsa/ml_dsa_local.h
@@ -76,6 +76,16 @@ void ossl_ml_dsa_poly_ntt(POLY *s);
void ossl_ml_dsa_poly_ntt_inverse(POLY *s);
void ossl_ml_dsa_poly_ntt_mult(const POLY *lhs, const POLY *rhs, POLY *out);
+/*
+ * Optimization for s390x: vectorized NTT, inverse NTT and NTT-domain
+ * multiplication using 128 bit vector registers.
+ */
+/* z13 supports VX, z14 supports VXE; z14 means __ARCH__ == 12 */
+#if defined(OPENSSL_ML_DSA_S390X) && defined(__s390x__) && (__ARCH__ >= 12) && defined(__VX__)
+#include "arch/s390x_arch.h"
+/*
+ * Defined when the compiler can build the vec128 code. It gates the runtime
+ * dispatch in ml_dsa_ntt_init() and the 16-byte aligned layout of POLY.
+ */
+#define VX_COMPILER_SUPPORT_VEC128
+/* In-place forward NTT of |p| (FIPS 204, Algorithm 41) */
+void ossl_ml_dsa_poly_ntt_vec128(POLY *p);
+/* In-place inverse NTT of |p| (FIPS 204, Algorithm 42) */
+void ossl_ml_dsa_poly_ntt_inverse_vec128(POLY *p);
+/* Coefficient-wise Montgomery product of |lhs| and |rhs|, written to |out| */
+void ossl_poly_ntt_mult_scalar_vec128(const POLY *lhs, const POLY *rhs, POLY *out);
+#endif
+
void ossl_ml_dsa_key_compress_power2_round(uint32_t r, uint32_t *r1, uint32_t *r0);
uint32_t ossl_ml_dsa_key_compress_high_bits(uint32_t r, uint32_t gamma2);
void ossl_ml_dsa_key_compress_decompose(uint32_t r, uint32_t gamma2,
diff --git a/crypto/ml_dsa/ml_dsa_ntt.c b/crypto/ml_dsa/ml_dsa_ntt.c
index dc81f81822..3f5ebd5206 100644
--- a/crypto/ml_dsa/ml_dsa_ntt.c
+++ b/crypto/ml_dsa/ml_dsa_ntt.c
@@ -240,6 +240,13 @@ static void ml_dsa_ntt_init(void)
poly_ntt_mult_impl = poly_ntt_mult_avx2_wrapper;
}
#endif
+#ifdef VX_COMPILER_SUPPORT_VEC128
+ if (S390X_VX_CAPABLE) {
+ poly_ntt_impl = ossl_ml_dsa_poly_ntt_vec128;
+ poly_ntt_inverse_impl = ossl_ml_dsa_poly_ntt_inverse_vec128;
+ poly_ntt_mult_impl = ossl_poly_ntt_mult_scalar_vec128;
+ }
+#endif
}
/*
diff --git a/crypto/ml_dsa/ml_dsa_ntt_vec128.c b/crypto/ml_dsa/ml_dsa_ntt_vec128.c
new file mode 100644
index 0000000000..8176cbcdfb
--- /dev/null
+++ b/crypto/ml_dsa/ml_dsa_ntt_vec128.c
@@ -0,0 +1,693 @@
+/*
+ * Copyright 2024-2026 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include "ml_dsa_local.h"
+#include "ml_dsa_poly.h"
+
+#if defined(OPENSSL_ML_DSA_S390X) && defined(__s390x__) && (__ARCH__ >= 12) && defined(__VX__)
+
+#include <vecintrin.h>
+
+#include <stdint.h>
+
+/* Width of vector registers in bytes */
+#define VECTOR_REG_WIDTH_BYTES 16
+/*
+ * Working vector types: four 32-bit lanes per 128-bit register.
+ * __may_alias__ solves the undefined behavior problem in code like
+ * vec_int32_t *out_vec_ptr = (vec_int32_t *)out->coeff;
+ */
+typedef int32_t vec_int32_t __attribute__((vector_size(VECTOR_REG_WIDTH_BYTES), __may_alias__));
+typedef uint32_t vec_uint32_t __attribute__((vector_size(VECTOR_REG_WIDTH_BYTES), __may_alias__));
+
+/*
+ * Despite their names, these are the plain vector types WITHOUT
+ * __may_alias__. In this file they are only used to cast the operands of
+ * vec_mulh().
+ * NOTE(review): presumably the vec_mulh() overloads do not accept the
+ * __may_alias__ variants - confirm.
+ */
+typedef int32_t vec_int32_alias_t __attribute__((vector_size(VECTOR_REG_WIDTH_BYTES)));
+typedef uint32_t vec_uint32_alias_t __attribute__((vector_size(VECTOR_REG_WIDTH_BYTES)));
+
+/* Our implementation of the vectorized algorithms assumes NUM_INT32_IN_VECTOR == 4. */
+#define NUM_INT32_IN_VECTOR (VECTOR_REG_WIDTH_BYTES / ((int)sizeof(int32_t)))
+
+/*
+ * This file has multiple parts required for fast matrix multiplication,
+ * 1) NTT (See https://eprint.iacr.org/2024/585.pdf)
+ * NTT and NTT inverse transformations are Discrete Fourier Transforms in a
+ * polynomial ring. Fast-Fourier Transformations can then be applied to make
+ * multiplications n log(n). This uses the symmetry of the transformation to
+ * reduce computations.
+ *
+ * 2) Montgomery multiplication
+ * The multiplication of a.b mod q requires division by q which is a slow operation.
+ *
+ * When many multiplications mod q are required, Montgomery multiplication
+ * can be used. This requires a number R > q such that R and q are coprime
+ * (i.e. GCD(R, q) = 1), so that division happens using R instead of q.
+ * If R is a power of 2 then this division can be done as a bit shift.
+ *
+ * Given that q = 2^23 - 2^13 + 1,
+ * we can choose a Montgomery multiplier of R = 2^32.
+ *
+ * To transform |a| into Montgomery form |m| we use
+ * m = a mod q * ((2^32)*(2^32) mod q)
+ * which is then Montgomery reduced, removing the excess factor of R = 2^32.
+ *
+ * A good reference for optimizations around ML-DSA and Montgomery multiplication is
+ * [Seiler 2018, Faster AVX2 optimized NTT multiplication for Ring-LWE lattice cryptography].
+ */
+
+/*
+ * The table in FIPS 204 Appendix B uses the following formula
+ * zeta[k]= 1753^bitrev(k) mod q for (k = 1..255) (The first value is not used).
+ *
+ * As this implementation uses Montgomery form with a multiplier of 2^32,
+ * the values need to be transformed, i.e.
+ *
+ * zetasMontgomery[k] = reduce_montgomery(zeta[k] * (2^32 * 2^32 mod(q)))
+ * reduce_montgomery() is defined below.
+ */
+static const int32_t zetas_montgomery[256] = {
+ 4193792, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468,
+ 1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103,
+ 2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868,
+ 6262231, 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005,
+ 2706023, 95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439,
+ 4519302, 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118,
+ 6681150, 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596,
+ 811944, 531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638,
+ 4450022, 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196,
+ 7122806, 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922,
+ 3412210, 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370,
+ 7709315, 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987,
+ 5037034, 264944, 508951, 3097992, 44288, 7280319, 904516, 3958618,
+ 4656075, 8371839, 1653064, 5130689, 2389356, 8169440, 759969, 7063561,
+ 189548, 4827145, 3159746, 6529015, 5971092, 8202977, 1315589, 1341330,
+ 1285669, 6795489, 7567685, 6940675, 5361315, 4499357, 4751448, 3839961,
+ 2091667, 3407706, 2316500, 3817976, 5037939, 2244091, 5933984, 4817955,
+ 266997, 2434439, 7144689, 3513181, 4860065, 4621053, 7183191, 5187039,
+ 900702, 1859098, 909542, 819034, 495491, 6767243, 8337157, 7857917,
+ 7725090, 5257975, 2031748, 3207046, 4823422, 7855319, 7611795, 4784579,
+ 342297, 286988, 5942594, 4108315, 3437287, 5038140, 1735879, 203044,
+ 2842341, 2691481, 5790267, 1265009, 4055324, 1247620, 2486353, 1595974,
+ 4613401, 1250494, 2635921, 4832145, 5386378, 1869119, 1903435, 7329447,
+ 7047359, 1237275, 5062207, 6950192, 7929317, 1312455, 3306115, 6417775,
+ 7100756, 1917081, 5834105, 7005614, 1500165, 777191, 2235880, 3406031,
+ 7838005, 5548557, 6709241, 6533464, 5796124, 4656147, 594136, 4603424,
+ 6366809, 2432395, 2454455, 8215696, 1957272, 3369112, 185531, 7173032,
+ 5196991, 162844, 1616392, 3014001, 810149, 1652634, 4686184, 6581310,
+ 5341501, 3523897, 3866901, 269760, 2213111, 7404533, 1717735, 472078,
+ 7953734, 1723600, 6577327, 1910376, 6712985, 7276084, 8119771, 4546524,
+ 5441381, 6144432, 7959518, 6094090, 183443, 7403526, 1612842, 4834730,
+ 7826001, 3919660, 8332111, 7018208, 3937738, 1400424, 7534263, 1976782
+};
+
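+/*
+ * Companion tables, indexed exactly like zetas_montgomery:
+ *
+ * zetas_montgomery_twisted[k] is zetas_montgomery[k] * q^-1 mod 2^32 as a
+ * signed value, i.e. the a_twist operand expected by
+ * montgomery_multiplication_vectorized().
+ * neg_zetas_montgomery[k] = q - zetas_montgomery[k], i.e. -zeta mod q, as
+ * used by the inverse NTT.
+ * neg_zetas_montgomery_twisted[k] is the twisted form of
+ * neg_zetas_montgomery[k].
+ */
+/* clang-format off */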
+static const int32_t zetas_montgomery_twisted[256] = {
+ -512, 1830765815, -1929875197, -1927777020,
+ 1640767044, 1477910809, 1612161321, 1640734244,
+ 308362795, -1815525077, -1374673746, -1091570560,
+ -1929495947, 515185418, -285697463, 625853735,
+ 1727305304, 2082316400, -1364982363, 858240904,
+ 1806278033, 222489249, -346752664, 684667772,
+ 1654287831, -878576920, -1257667336, -748618599,
+ 329347125, 1837364259, -1443016191, -1170414139,
+ -1846138265, -1631226336, -1404529459, 1838055109,
+ 1594295556, -1076973523, -1898723371, -594436433,
+ -202001018, -475984259, -561427818, 1797021250,
+ -1061813248, 2059733582, -1661512036, -1104976546,
+ -1750224322, -901666089, 418987550, 1831915354,
+ -1925356481, 992097816, 879957085, 2024403852,
+ 1484874664, -1636082790, -285388938, -1983539117,
+ -1495136972, -950076367, -1714807468, -952438994,
+ -1574918426, -654783358, 1350681040, -1974159334,
+ -2143979938, 1651689966, 1599739335, 140455868,
+ -1285853322, -1039411342, -993005453, 1955560695,
+ -1440787839, 1529189039, 568627425, -2131021878,
+ -783134478, -247357818, -588790216, 1518161567,
+ 289871780, -86965172, -1262003602, 1708872714,
+ 2135294595, 1787797780, -1018755524, 1638590968,
+ -889861154, -120646188, 1665705315, -1669960605,
+ 1321868266, -916321552, 1225434135, 1155548552,
+ -1784632064, 2143745727, 666258756, 1210558298,
+ 675310539, -1261461889, -1555941048, -318346815,
+ -1999506068, 628664288, -1499481951, -1729304567,
+ -695180180, 1422575625, -1375177022, 1424130039,
+ 1777179796, -1185330463, 334803717, 235321234,
+ -178766299, 168022241, -518252219, 1206536195,
+ 1957047971, 985155485, 1146323032, -894060583,
+ -898413, 991903578, 1363007700, 746144248,
+ -1363460237, 912367099, 30313376, -1420958685,
+ -605900043, -44694137, -326425359, 2032221021,
+ 2027833505, 1176904445, 1683520343, 1904936415,
+ 14253662, -421552614, -517299994, 1257750362,
+ 1014493059, -818371957, 2027935493, 1926727421,
+ 863641634, 1747917559, -1372618620, 1931587462,
+ 1819892094, -325927721, 128353683, 1258381763,
+ 2124962073, 908452108, -1123881662, 885133339,
+ -1223601433, 1851023420, 137583815, 1629985060,
+ -1920467227, -1176751719, -635454917, 1967222129,
+ -1637785316, -1354528380, -642772911, 6363718,
+ -1536588519, -72690498, 45766801, -1287922799,
+ 694382730, -314284737, 671509323, 1136965287,
+ 235104447, 985022747, -2070602177, 1779436848,
+ -1045062171, 963438279, 419615363, 1116720495,
+ 831969620, -1078959975, 1216882041, 1042326958,
+ -300448763, 604552167, -270590488, 1405999311,
+ 756955445, -1021949427, -1276805127, 713994584,
+ -260312804, 608791571, 371462360, 940195360,
+ 1554794073, 173440395, -1357098057, -1542497136,
+ 1339088280, -2126092136, -384158533, 2061661096,
+ -2040058689, -1316619236, 827959816, -883155599,
+ -853476187, -1039370342, -596344472, 1726753854,
+ -2047270595, 6087993, 702390549, -1547952704,
+ -1723816713, -110126091, -279505433, 394851342,
+ -1591599802, 565464272, -260424529, 283780712,
+ -440824167, -1758099916, -71875109, 776003548,
+ 1119856485, -1600929360, -1208667170, 1123958026,
+ 1544891539, 879867910, -1499603926, 201262506,
+ 155290193, -1809756372, 2036925263, 1934038752,
+ -973777462, 400711272, -540420425, 374860238
+};
+
+static const int32_t neg_zetas_montgomery[256] = {
+ 4186625, 8354570, 2608894, 518909,
+ 8143293, 777960, 876248, 7913949,
+ 6554070, 6026966, 359251, 2091905,
+ 5260684, 2884855, 5268920, 5700314,
+ 5654953, 7356305, 1079900, 4794489,
+ 549488, 1119584, 5760665, 2108549,
+ 2118186, 3859737, 1399561, 3277672,
+ 6623180, 19422, 4369920, 8100412,
+ 5674394, 8284641, 5303092, 4849980,
+ 1661693, 3592148, 2537516, 4464978,
+ 3861115, 3043716, 4805995, 2867647,
+ 4840449, 300467, 6031717, 539299,
+ 1699267, 1643818, 4874723, 3821735,
+ 4873154, 2140649, 1600420, 4680821,
+ 7568473, 7849063, 7426187, 4499374,
+ 4479693, 2556880, 6308525, 2797779,
+ 3930395, 1528703, 3677745, 3041255,
+ 1452451, 4904467, 6203962, 1585221,
+ 1257611, 6441103, 4083598, 1000202,
+ 3190144, 3157330, 3632928, 8253495,
+ 4968207, 983419, 6232521, 5665122,
+ 2967645, 3693493, 411027, 2477047,
+ 671102, 1228525, 22981, 1308169,
+ 381987, 7031341, 6527646, 1430430,
+ 3343383, 8115473, 7871466, 5282425,
+ 8336129, 1100098, 7475901, 4421799,
+ 3724342, 8578, 6727353, 3249728,
+ 5991061, 210977, 7620448, 1316856,
+ 8190869, 3553272, 5220671, 1851402,
+ 2409325, 177440, 7064828, 7039087,
+ 7094748, 1584928, 812732, 1439742,
+ 3019102, 3881060, 3628969, 4540456,
+ 6288750, 4972711, 6063917, 4562441,
+ 3342478, 6136326, 2446433, 3562462,
+ 8113420, 5945978, 1235728, 4867236,
+ 3520352, 3759364, 1197226, 3193378,
+ 7479715, 6521319, 7470875, 7561383,
+ 7884926, 1613174, 43260, 522500,
+ 655327, 3122442, 6348669, 5173371,
+ 3556995, 525098, 768622, 3595838,
+ 8038120, 8093429, 2437823, 4272102,
+ 4943130, 3342277, 6644538, 8177373,
+ 5538076, 5688936, 2590150, 7115408,
+ 4325093, 7132797, 5894064, 6784443,
+ 3767016, 7129923, 5744496, 3548272,
+ 2994039, 6511298, 6476982, 1050970,
+ 1333058, 7143142, 3318210, 1430225,
+ 451100, 7067962, 5074302, 1962642,
+ 1279661, 6463336, 2546312, 1374803,
+ 6880252, 7603226, 6144537, 4974386,
+ 542412, 2831860, 1671176, 1846953,
+ 2584293, 3724270, 7786281, 3776993,
+ 2013608, 5948022, 5925962, 164721,
+ 6423145, 5011305, 8194886, 1207385,
+ 3183426, 8217573, 6764025, 5366416,
+ 7570268, 6727783, 3694233, 1799107,
+ 3038916, 4856520, 4513516, 8110657,
+ 6167306, 975884, 6662682, 7908339,
+ 426683, 6656817, 1803090, 6470041,
+ 1667432, 1104333, 260646, 3833893,
+ 2939036, 2235985, 420899, 2286327,
+ 8196974, 976891, 6767575, 3545687,
+ 554416, 4460757, 48306, 1362209,
+ 4442679, 6979993, 846154, 6403635
+};
+
+static const int32_t neg_zetas_montgomery_twisted[256] = {
+ 513, -1830765814, 1929875198, 1927777021,
+ -1640767043, -1477910808, -1612161320, -1640734243,
+ -308362794, 1815525078, 1374673747, 1091570561,
+ 1929495948, -515185417, 285697464, -625853734,
+ -1727305303, -2082316399, 1364982364, -858240903,
+ -1806278032, -222489248, 346752665, -684667771,
+ -1654287830, 878576921, 1257667337, 748618600,
+ -329347124, -1837364258, 1443016192, 1170414140,
+ 1846138266, 1631226337, 1404529460, -1838055108,
+ -1594295555, 1076973524, 1898723372, 594436434,
+ 202001019, 475984260, 561427819, -1797021249,
+ 1061813249, -2059733581, 1661512037, 1104976547,
+ 1750224323, 901666090, -418987549, -1831915353,
+ 1925356482, -992097815, -879957084, -2024403851,
+ -1484874663, 1636082791, 285388939, 1983539118,
+ 1495136973, 950076368, 1714807469, 952438995,
+ 1574918427, 654783359, -1350681039, 1974159335,
+ 2143979939, -1651689965, -1599739334, -140455867,
+ 1285853323, 1039411343, 993005454, -1955560694,
+ 1440787840, -1529189038, -568627424, 2131021879,
+ 783134479, 247357819, 588790217, -1518161566,
+ -289871779, 86965173, 1262003603, -1708872713,
+ -2135294594, -1787797779, 1018755525, -1638590967,
+ 889861155, 120646189, -1665705314, 1669960606,
+ -1321868265, 916321553, -1225434134, -1155548551,
+ 1784632065, -2143745726, -666258755, -1210558297,
+ -675310538, 1261461890, 1555941049, 318346816,
+ 1999506069, -628664287, 1499481952, 1729304568,
+ 695180181, -1422575624, 1375177023, -1424130038,
+ -1777179795, 1185330464, -334803716, -235321233,
+ 178766300, -168022240, 518252220, -1206536194,
+ -1957047970, -985155484, -1146323031, 894060584,
+ 898414, -991903577, -1363007699, -746144247,
+ 1363460238, -912367098, -30313375, 1420958686,
+ 605900044, 44694138, 326425360, -2032221020,
+ -2027833504, -1176904444, -1683520342, -1904936414,
+ -14253661, 421552615, 517299995, -1257750361,
+ -1014493058, 818371958, -2027935492, -1926727420,
+ -863641633, -1747917558, 1372618621, -1931587461,
+ -1819892093, 325927722, -128353682, -1258381762,
+ -2124962072, -908452107, 1123881663, -885133338,
+ 1223601434, -1851023419, -137583814, -1629985059,
+ 1920467228, 1176751720, 635454918, -1967222128,
+ 1637785317, 1354528381, 642772912, -6363717,
+ 1536588520, 72690499, -45766800, 1287922800,
+ -694382729, 314284738, -671509322, -1136965286,
+ -235104446, -985022746, 2070602178, -1779436847,
+ 1045062172, -963438278, -419615362, -1116720494,
+ -831969619, 1078959976, -1216882040, -1042326957,
+ 300448764, -604552166, 270590489, -1405999310,
+ -756955444, 1021949428, 1276805128, -713994583,
+ 260312805, -608791570, -371462359, -940195359,
+ -1554794072, -173440394, 1357098058, 1542497137,
+ -1339088279, 2126092137, 384158534, -2061661095,
+ 2040058690, 1316619237, -827959815, 883155600,
+ 853476188, 1039370343, 596344473, -1726753853,
+ 2047270596, -6087992, -702390548, 1547952705,
+ 1723816714, 110126092, 279505434, -394851341,
+ 1591599803, -565464271, 260424530, -283780711,
+ 440824168, 1758099917, 71875110, -776003547,
+ -1119856484, 1600929361, 1208667171, -1123958025,
+ -1544891538, -879867909, 1499603927, -201262505,
+ -155290192, 1809756373, -2036925262, -1934038751,
+ 973777463, -400711271, 540420426, -374860237
+};
+/* clang-format on */
+
+/* The modulus q replicated into all four lanes */
+static const vec_int32_t vec_q = { ML_DSA_Q, ML_DSA_Q, ML_DSA_Q, ML_DSA_Q };
+/* ML_DSA_Q_INV replicated into all four lanes; multiplying by it yields the "twisted" factor */
+static const vec_int32_t vec_q_inv = { ML_DSA_Q_INV, ML_DSA_Q_INV, ML_DSA_Q_INV, ML_DSA_Q_INV };
+
+/*
+ * @brief Computes the Montgomery product of a and b.
+ * See [Seiler 2018, Algorithm 3].
+ *
+ * With z = a * b and k = z * q^-1 mod 2^32, the value z - k * q is divisible
+ * by 2^32, so the result is the difference of the two high halves:
+ *     r = mulh(a, b) - mulh(k, q).
+ *
+ * k has to be taken as a SIGNED 32-bit value. With an unsigned k the
+ * correction term lies in [0, q-1] and the result in (z_high - q, z_high],
+ * which drops below -q for negative b. For example a = 41978,
+ * a_twist = -8395782, b = -4186116 would give -q - 1 instead of -1.
+ * With a signed k the result stays in -q+1..q-1 whenever
+ * |a * b| < q * 2^31, which holds for every 32-bit b if 0 <= a <= q.
+ *
+ * @param a is the first factor, assumed to be non-negative.
+ * @param a_twist is (int32)((uint32)a * ML_DSA_Q_INV).
+ * @param b is the second factor.
+ * @returns The Montgomery product of a and b in the range
+ * -q+1..q-1.
+ */
+
+static ossl_inline
+    vec_int32_t
+    montgomery_multiplication_vectorized(vec_int32_t a, vec_int32_t a_twist, vec_int32_t b)
+{
+    /* Low 32 bits of a * b * q^-1; unsigned lanes make the wrap-around well defined */
+    vec_uint32_t k = (vec_uint32_t)a_twist * (vec_uint32_t)b;
+    /* Signed high half of k * q: k is reinterpreted as a signed lane value */
+    vec_int32_t c = vec_mulh((vec_int32_alias_t)k, (vec_int32_alias_t)vec_q);
+    /* Signed high half of a * b */
+    vec_int32_t z_high = vec_mulh((vec_int32_alias_t)a, (vec_int32_alias_t)b);
+
+    return z_high - c;
+}
+
+/*
+ * @brief Map every lane from (-q, q) to its representative in [0, q-1].
+ * Negative lanes get q added, all other lanes are returned unchanged.
+ *
+ * @param a in (-q, q)
+ * @returns a mod q in [0, q-1]
+ */
+static ossl_inline
+    vec_int32_t
+    reduce_once_signed(vec_int32_t a)
+{
+    /* The arithmetic shift replicates the sign bit: 11..11 for negative lanes, else 0 */
+    const vec_int32_t sign_fill = a >> 31;
+
+    return a + (vec_q & sign_fill);
+}
+
+/*
+ * @brief Reduce modulo q to a non-negative vector.
+ * Note that the constant v_scalar equals floor(2**53 / q), i.e.
+ * floor(2**(floor(log_2(q)) - 1) * 2**32 / q) with floor(log_2(q)) == 22.
+ *
+ * The quotient estimate t = floor(a * v_scalar / 2**53) can differ from
+ * floor(a / q) by one: it is one too small when a is positive and
+ * a mod q is 0 or 1 (e.g. a == q yields t == 0), and one too large when a is
+ * negative and a mod q == q - 1. The raw remainder a - t * q therefore lies
+ * in [-1, q + 1] and needs a correction in both directions.
+ *
+ * @param a in the range -2**31..2**31-1
+ * @returns a mod q in the range 0..q-1
+ */
+static ossl_inline
+    vec_int32_t
+    reduce_fully(vec_int32_t a)
+{
+    const int32_t v_scalar = 1074791296;
+    const vec_int32_alias_t v = { v_scalar, v_scalar, v_scalar, v_scalar };
+    /* mulh divides a * v by 2^32, the shift divides by the remaining 2^21 */
+    vec_int32_t t = vec_mulh((vec_int32_alias_t)a, v) >> 21;
+    vec_int32_t r;
+
+    t *= ML_DSA_Q;
+    r = a - t; /* in [-1, q + 1] */
+    r = reduce_once_signed(r); /* in [0, q + 1] */
+    /* Subtract q and add it back only where that went negative: [0, q - 1] */
+    return reduce_once_signed(r - vec_q);
+}
+
+/*
+ * @brief Coefficient-wise Montgomery multiplication of two polynomials,
+ * four coefficients at a time.
+ *
+ * The "twisted" factor needed by montgomery_multiplication_vectorized() is
+ * computed on the fly from the |lhs| coefficients. Every product is passed
+ * through reduce_once_signed() before it is stored.
+ *
+ * @param lhs first factor
+ * @param rhs second factor
+ * @param out receives the products
+ */
+void ossl_poly_ntt_mult_scalar_vec128(const POLY *lhs, const POLY *rhs, POLY *out)
+{
+    const int num_vectors = ML_DSA_NUM_POLY_COEFFICIENTS / NUM_INT32_IN_VECTOR;
+    const vec_int32_t *left = (const vec_int32_t *)lhs->coeff;
+    const vec_int32_t *right = (const vec_int32_t *)rhs->coeff;
+    vec_int32_t *dest = (vec_int32_t *)out->coeff;
+    int idx = 0;
+
+    while (idx < num_vectors) {
+        const vec_int32_t factor = left[idx];
+        /* Unsigned lanes: only the low 32 bits of factor * q^-1 are wanted */
+        const vec_uint32_t twisted = (vec_uint32_t)factor * (vec_uint32_t)vec_q_inv;
+
+        dest[idx] = reduce_once_signed(
+            montgomery_multiplication_vectorized(factor, (vec_int32_t)twisted, right[idx]));
+        idx++;
+    }
+}
+
+/*
+ * In place number theoretic transform of a given polynomial.
+ *
+ * See FIPS 204, Algorithm 41, NTT()
+ * This function uses Montgomery multiplication and handles four 32-bit
+ * coefficients per vector. It runs in three phases:
+ * - offsets 128..4: both butterfly halves are whole vectors,
+ * - offset 2: the halves are lanes {0,1} and {2,3} of each vector,
+ * - offset 1: the halves are lanes {0,2} and {1,3} of each vector.
+ * Only the last phase reduces its results (with reduce_fully()); the earlier
+ * phases store the plain sums and differences.
+ *
+ * @param p a polynomial that is used as the input, that is replaced with
+ * the NTT of the polynomial
+ */
+void ossl_ml_dsa_poly_ntt_vec128(POLY *p)
+{
+ int i, j, k;
+ int step;
+ int offset = ML_DSA_NUM_POLY_COEFFICIENTS;
+ /* View of the coefficients as vectors of NUM_INT32_IN_VECTOR lanes */
+ vec_int32_t *p_vec = (vec_int32_t *)p->coeff;
+
+ /* Step: 1, 2, 4, 8, ..., 32 */
+ for (step = 1; step < ML_DSA_NUM_POLY_COEFFICIENTS / 4; step <<= 1) {
+ k = 0;
+ offset >>= 1; /* Offset: 128, 64, 32, 16, ..., 4 */
+
+ for (i = 0; i < step; i++) {
+ /* One zeta per group of 2 * offset coefficients, replicated into all lanes */
+ const vec_int32_t zeta = { zetas_montgomery[step + i],
+ zetas_montgomery[step + i],
+ zetas_montgomery[step + i],
+ zetas_montgomery[step + i] };
+ const vec_int32_t zeta_twisted = { zetas_montgomery_twisted[step + i],
+ zetas_montgomery_twisted[step + i],
+ zetas_montgomery_twisted[step + i],
+ zetas_montgomery_twisted[step + i] };
+
+ /* Butterfly between coefficients j..j+3 and j+offset..j+offset+3 */
+ for (j = k; j < k + offset; j += NUM_INT32_IN_VECTOR) {
+ vec_int32_t w_even_vec = p_vec[j / NUM_INT32_IN_VECTOR];
+ vec_int32_t w_odd_vec = p_vec[(j + offset) / NUM_INT32_IN_VECTOR];
+ vec_int32_t t_odd_vec = montgomery_multiplication_vectorized(
+ zeta,
+ zeta_twisted,
+ w_odd_vec);
+ /* Stored without reduction */
+ vec_int32_t coeff_j_vec = (w_even_vec + t_odd_vec);
+ vec_int32_t coeff_j_offset_vec = (w_even_vec - t_odd_vec);
+ p_vec[j / NUM_INT32_IN_VECTOR] = coeff_j_vec;
+ p_vec[(j + offset) / NUM_INT32_IN_VECTOR] = coeff_j_offset_vec;
+ }
+ k += 2 * offset;
+ }
+ }
+
+ /*
+ * offset == 2: each iteration handles two adjacent vectors (8 coefficients)
+ * and uses two consecutive zetas, one per vector.
+ * NOTE(review): k and offset are assigned here but not read by this loop.
+ */
+ k = 0;
+ step = 64;
+ offset = 2;
+ for (j = 0; j < ML_DSA_NUM_POLY_COEFFICIENTS; j += 2 * NUM_INT32_IN_VECTOR) {
+ const vec_int32_t zeta = {
+ zetas_montgomery[step + j / NUM_INT32_IN_VECTOR],
+ zetas_montgomery[step + j / NUM_INT32_IN_VECTOR],
+ zetas_montgomery[step + j / NUM_INT32_IN_VECTOR + 1],
+ zetas_montgomery[step + j / NUM_INT32_IN_VECTOR + 1]
+ };
+ const vec_int32_t zeta_twisted = {
+ zetas_montgomery_twisted[step + j / NUM_INT32_IN_VECTOR],
+ zetas_montgomery_twisted[step + j / NUM_INT32_IN_VECTOR],
+ zetas_montgomery_twisted[step + j / NUM_INT32_IN_VECTOR + 1],
+ zetas_montgomery_twisted[step + j / NUM_INT32_IN_VECTOR + 1]
+ };
+
+ /* Gather lanes {0,1} of both vectors as the "even" half ... */
+ vec_int32_t w_even_vec = {
+ p_vec[j / NUM_INT32_IN_VECTOR][0],
+ p_vec[j / NUM_INT32_IN_VECTOR][1],
+ p_vec[j / NUM_INT32_IN_VECTOR + 1][0],
+ p_vec[j / NUM_INT32_IN_VECTOR + 1][1]
+ };
+ /* ... and lanes {2,3} as the "odd" half */
+ vec_int32_t w_odd_vec = {
+ p_vec[j / NUM_INT32_IN_VECTOR][2],
+ p_vec[j / NUM_INT32_IN_VECTOR][3],
+ p_vec[j / NUM_INT32_IN_VECTOR + 1][2],
+ p_vec[j / NUM_INT32_IN_VECTOR + 1][3]
+ };
+ vec_int32_t t_odd_vec = montgomery_multiplication_vectorized(
+ zeta,
+ zeta_twisted,
+ w_odd_vec);
+ vec_int32_t coeff_j_vec = (w_even_vec + t_odd_vec);
+ vec_int32_t coeff_j_offset_vec = (w_even_vec - t_odd_vec);
+ /* Scatter back: sums into lanes {0,1}, differences into lanes {2,3} */
+ p_vec[j / NUM_INT32_IN_VECTOR] = (vec_int32_t) {
+ coeff_j_vec[0],
+ coeff_j_vec[1],
+ coeff_j_offset_vec[0],
+ coeff_j_offset_vec[1]
+ };
+ p_vec[j / NUM_INT32_IN_VECTOR + 1] = (vec_int32_t) {
+ coeff_j_vec[2],
+ coeff_j_vec[3],
+ coeff_j_offset_vec[2],
+ coeff_j_offset_vec[3]
+ };
+ }
+
+ /*
+ * offset == 1: each iteration handles 8 coefficients starting at k and
+ * uses four consecutive zetas, one per coefficient pair.
+ */
+ k = 0;
+ step = 128;
+ for (i = 0; i < step; i += NUM_INT32_IN_VECTOR) {
+ const vec_int32_t zeta = {
+ zetas_montgomery[step + i],
+ zetas_montgomery[step + i + 1],
+ zetas_montgomery[step + i + 2],
+ zetas_montgomery[step + i + 3]
+ };
+ const vec_int32_t zeta_twisted = {
+ zetas_montgomery_twisted[step + i],
+ zetas_montgomery_twisted[step + i + 1],
+ zetas_montgomery_twisted[step + i + 2],
+ zetas_montgomery_twisted[step + i + 3]
+ };
+
+ /* Even-indexed coefficients k, k+2, k+4, k+6 */
+ vec_int32_t w_even_vec = {
+ p_vec[k / NUM_INT32_IN_VECTOR][0],
+ p_vec[k / NUM_INT32_IN_VECTOR][2],
+ p_vec[k / NUM_INT32_IN_VECTOR + 1][0],
+ p_vec[k / NUM_INT32_IN_VECTOR + 1][2]
+ };
+ /* Odd-indexed coefficients k+1, k+3, k+5, k+7 */
+ vec_int32_t w_odd_vec = {
+ p_vec[k / NUM_INT32_IN_VECTOR][1],
+ p_vec[k / NUM_INT32_IN_VECTOR][3],
+ p_vec[k / NUM_INT32_IN_VECTOR + 1][1],
+ p_vec[k / NUM_INT32_IN_VECTOR + 1][3]
+ };
+ vec_int32_t t_odd_vec = montgomery_multiplication_vectorized(
+ zeta,
+ zeta_twisted,
+ w_odd_vec);
+ /* The only reduction of the whole transform happens here */
+ vec_int32_t coeff_j_vec = reduce_fully(w_even_vec + t_odd_vec);
+ vec_int32_t coeff_j_offset_vec = reduce_fully(w_even_vec - t_odd_vec);
+
+ /* Interleave the results back: sums to even, differences to odd indices */
+ p->coeff[k] = coeff_j_vec[0];
+ p->coeff[k + 2] = coeff_j_vec[1];
+ p->coeff[k + 4] = coeff_j_vec[2];
+ p->coeff[k + 6] = coeff_j_vec[3];
+ p->coeff[k + 1] = coeff_j_offset_vec[0];
+ p->coeff[k + 2 + 1] = coeff_j_offset_vec[1];
+ p->coeff[k + 4 + 1] = coeff_j_offset_vec[2];
+ p->coeff[k + 6 + 1] = coeff_j_offset_vec[3];
+
+ k += 2 * NUM_INT32_IN_VECTOR;
+ }
+}
+
+/*
+ * @brief In place inverse number theoretic transform of a given polynomial.
+ * See FIPS 204, Algorithm 42, NTT^-1()
+ *
+ * The layers run in the opposite order of the forward transform: offset 1
+ * (even/odd lanes), offset 2 (lane pairs), then offsets 4..128 (whole
+ * vectors). The zeta tables are walked backwards, using the negated zetas.
+ * Sums are stored without any reduction between layers; the only call to
+ * reduce_once_signed() is in the final scaling by 256^-1.
+ * NOTE(review): the unreduced sums can double in magnitude per layer. This
+ * presumably relies on input coefficients in [0, q), so that everything
+ * stays within int32 (256 * q < 2^31) - confirm against the callers.
+ *
+ * @param p a polynomial that is used as the input, that is overwritten with
+ * the inverse of the NTT.
+ */
+void ossl_ml_dsa_poly_ntt_inverse_vec128(POLY *p)
+{
+ /*
+ * Step: 128, 64, 32, 16, ..., 1
+ * Offset: 1, 2, 4, 8, ..., 128
+ */
+ int i, j, k, offset, step = ML_DSA_NUM_POLY_COEFFICIENTS;
+ /*
+ * The multiplicative inverse of 256 mod q, in Montgomery form is
+ * ((256^-1 mod q) * ((2^32 * 2^32) mod q)) mod q = (8347681 * 2365951) mod 8380417
+ */
+ static const int32_t inverse_degree_montgomery = 41978;
+ static const vec_int32_t vec_inverse_degree_montgomery = {
+ inverse_degree_montgomery,
+ inverse_degree_montgomery,
+ inverse_degree_montgomery,
+ inverse_degree_montgomery
+ };
+ /* The matching a_twist operand for montgomery_multiplication_vectorized() */
+ static const int32_t inverse_degree_montgomery_twisted = -8395782;
+ static const vec_int32_t vec_inverse_degree_montgomery_twisted = {
+ inverse_degree_montgomery_twisted,
+ inverse_degree_montgomery_twisted,
+ inverse_degree_montgomery_twisted,
+ inverse_degree_montgomery_twisted
+ };
+
+ /* View of the coefficients as vectors of NUM_INT32_IN_VECTOR lanes */
+ vec_int32_t *p_vec = (vec_int32_t *)p->coeff;
+
+ offset = 1;
+ step >>= 1;
+ k = 0;
+
+ /* Each iteration handles 8 coefficients starting at k and four zetas */
+ for (i = 0; i < step; i += NUM_INT32_IN_VECTOR) {
+ /* offset == 1 */
+ /* Table indices 2 * step - 1 - i and the three below it */
+ const vec_int32_t zeta = { neg_zetas_montgomery[step + (step - 1 - i)],
+ neg_zetas_montgomery[step + (step - 1 - i - 1)],
+ neg_zetas_montgomery[step + (step - 1 - i - 2)],
+ neg_zetas_montgomery[step + (step - 1 - i - 3)] };
+ const vec_int32_t zeta_twisted = { neg_zetas_montgomery_twisted[step + (step - 1 - i)],
+ neg_zetas_montgomery_twisted[step + (step - 1 - i - 1)],
+ neg_zetas_montgomery_twisted[step + (step - 1 - i - 2)],
+ neg_zetas_montgomery_twisted[step + (step - 1 - i - 3)] };
+ /* Gather the even-indexed and the odd-indexed coefficients */
+ vec_int32_t even = { p->coeff[k],
+ p->coeff[k + 2],
+ p->coeff[k + 4],
+ p->coeff[k + 6] };
+ vec_int32_t odd = { p->coeff[k + 1],
+ p->coeff[k + 1 + 2],
+ p->coeff[k + 1 + 4],
+ p->coeff[k + 1 + 6] };
+
+ /* Sum stays unreduced; the difference is multiplied by the negated zeta */
+ vec_int32_t coeff_j = (odd + even);
+ vec_int32_t coeff_j_offset = montgomery_multiplication_vectorized(
+ zeta,
+ zeta_twisted,
+ even - odd);
+
+ /* Interleave back: sums to even, products to odd indices */
+ p->coeff[k + 0] = coeff_j[0];
+ p->coeff[k + 2] = coeff_j[1];
+ p->coeff[k + 4] = coeff_j[2];
+ p->coeff[k + 6] = coeff_j[3];
+ p->coeff[k + 1 + 0] = coeff_j_offset[0];
+ p->coeff[k + 1 + 2] = coeff_j_offset[1];
+ p->coeff[k + 1 + 4] = coeff_j_offset[2];
+ p->coeff[k + 1 + 6] = coeff_j_offset[3];
+
+ k += 2 * NUM_INT32_IN_VECTOR;
+ }
+
+ /* offset == 2 */
+ offset <<= 1;
+ step >>= 1;
+ k = 0;
+
+ /* Each iteration handles 8 coefficients starting at k and two zetas */
+ for (i = 0; i < step; i += 2) {
+ const vec_int32_t zeta = { neg_zetas_montgomery[step + (step - 1 - i)],
+ neg_zetas_montgomery[step + (step - 1 - i)],
+ neg_zetas_montgomery[step + (step - 1 - i - 1)],
+ neg_zetas_montgomery[step + (step - 1 - i - 1)] };
+ const vec_int32_t zeta_twisted = { neg_zetas_montgomery_twisted[step + (step - 1 - i)],
+ neg_zetas_montgomery_twisted[step + (step - 1 - i)],
+ neg_zetas_montgomery_twisted[step + (step - 1 - i - 1)],
+ neg_zetas_montgomery_twisted[step + (step - 1 - i - 1)] };
+
+ j = k;
+ /* Coefficient pairs {j, j+1} and {j+4, j+5} form the "even" half ... */
+ vec_int32_t even = { p->coeff[j],
+ p->coeff[j + 1],
+ p->coeff[j + 4],
+ p->coeff[j + 5] };
+ /* ... and pairs {j+2, j+3} and {j+6, j+7} the "odd" half */
+ vec_int32_t odd = { p->coeff[j + 2],
+ p->coeff[j + 3],
+ p->coeff[j + 6],
+ p->coeff[j + 7] };
+
+ vec_int32_t coeff_j = (odd + even);
+ vec_int32_t coeff_j_offset = montgomery_multiplication_vectorized(
+ zeta,
+ zeta_twisted,
+ even - odd);
+
+ /* Scatter back: sums into lanes {0,1}, products into lanes {2,3} */
+ p_vec[j / NUM_INT32_IN_VECTOR] = (vec_int32_t) {
+ coeff_j[0],
+ coeff_j[1],
+ coeff_j_offset[0],
+ coeff_j_offset[1]
+ };
+ p_vec[j / NUM_INT32_IN_VECTOR + 1] = (vec_int32_t) {
+ coeff_j[2],
+ coeff_j[3],
+ coeff_j_offset[2],
+ coeff_j_offset[3]
+ };
+ /* Advance by 8 coefficients (offset is 2 here) */
+ k += 2 * 2 * offset;
+ }
+
+ /* offset >= 4: both butterfly halves are whole vectors */
+ for (offset <<= 1; offset < ML_DSA_NUM_POLY_COEFFICIENTS; offset <<= 1) {
+ step >>= 1;
+ k = 0;
+
+ for (i = 0; i < step; i++) {
+ /* One negated zeta per group of 2 * offset coefficients, in all lanes */
+ const vec_int32_t zeta = { neg_zetas_montgomery[step + (step - 1 - i)],
+ neg_zetas_montgomery[step + (step - 1 - i)],
+ neg_zetas_montgomery[step + (step - 1 - i)],
+ neg_zetas_montgomery[step + (step - 1 - i)] };
+ const vec_int32_t zeta_twisted = { neg_zetas_montgomery_twisted[step + (step - 1 - i)],
+ neg_zetas_montgomery_twisted[step + (step - 1 - i)],
+ neg_zetas_montgomery_twisted[step + (step - 1 - i)],
+ neg_zetas_montgomery_twisted[step + (step - 1 - i)] };
+
+ for (j = k; j < k + offset; j += NUM_INT32_IN_VECTOR) {
+ vec_int32_t even = p_vec[j / NUM_INT32_IN_VECTOR];
+ vec_int32_t odd = p_vec[(j + offset) / NUM_INT32_IN_VECTOR];
+
+ vec_int32_t coeff_j = (odd + even);
+ vec_int32_t coeff_j_offset = montgomery_multiplication_vectorized(
+ zeta,
+ zeta_twisted,
+ even - odd);
+ p_vec[j / NUM_INT32_IN_VECTOR] = coeff_j;
+ p_vec[(j + offset) / NUM_INT32_IN_VECTOR] = coeff_j_offset;
+ }
+ k += 2 * offset;
+ }
+ }
+
+ /* Scale every coefficient by 256^-1 (Montgomery form) and make it non-negative */
+ for (i = 0; i < ML_DSA_NUM_POLY_COEFFICIENTS / NUM_INT32_IN_VECTOR; i += 1) {
+ vec_int32_t coeff_i_vec = montgomery_multiplication_vectorized(
+ vec_inverse_degree_montgomery,
+ vec_inverse_degree_montgomery_twisted,
+ p_vec[i]);
+ p_vec[i] = reduce_once_signed(coeff_i_vec);
+ }
+}
+
+#endif
diff --git a/crypto/ml_dsa/ml_dsa_poly.h b/crypto/ml_dsa/ml_dsa_poly.h
index 061deb147b..7998fdd9c1 100644
--- a/crypto/ml_dsa/ml_dsa_poly.h
+++ b/crypto/ml_dsa/ml_dsa_poly.h
@@ -11,13 +11,18 @@
#include <openssl/crypto.h>
+#include "internal/common.h"
#include "ml_dsa_local.h"
#define ML_DSA_NUM_POLY_COEFFICIENTS 256
/* Polynomial object with 256 coefficients. The coefficients are unsigned 32 bits */
struct poly_st {
+#if defined(VX_COMPILER_SUPPORT_VEC128)
+ /* 16-byte aligned: the vec128 NTT code accesses coeff through 16-byte vector pointers */
+ ALIGN16 uint32_t coeff[ML_DSA_NUM_POLY_COEFFICIENTS];
+#else
uint32_t coeff[ML_DSA_NUM_POLY_COEFFICIENTS];
+#endif
};
static ossl_inline ossl_unused void
diff --git a/crypto/ml_dsa/ml_dsa_sign.c b/crypto/ml_dsa/ml_dsa_sign.c
index 51c2709ddb..05251a6dd0 100644
--- a/crypto/ml_dsa/ml_dsa_sign.c
+++ b/crypto/ml_dsa/ml_dsa_sign.c
@@ -168,12 +168,14 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv,
EVP_MD_CTX *md_ctx = NULL;
uint32_t k = (uint32_t)params->k, l = (uint32_t)params->l;
uint32_t gamma1 = params->gamma1, gamma2 = params->gamma2;
- uint8_t *alloc = NULL, *w1_encoded;
+ uint8_t *alloc = NULL, *w1_encoded = NULL;
+ void *alloc_freeptr = NULL;
size_t alloc_len, w1_encoded_len;
size_t num_polys_sig_k = 2 * k;
size_t num_polys_k = 5 * k;
size_t num_polys_l = 3 * l;
size_t num_polys_k_by_l = k * l;
+ size_t poly_count;
POLY *p, *c_ntt;
VECTOR s1_ntt, s2_ntt, t0_ntt, w, w1, cs1, cs2, y;
MATRIX a_ntt;
@@ -188,23 +190,25 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv,
return 0;
}
- /*
- * Allocate a single blob for most of the variable size temporary variables.
- * Mostly used for VECTOR POLYNOMIALS (every POLY is 1K).
- */
+ /* Allocate w1_encoded buffer */
w1_encoded_len = k * (gamma2 == ML_DSA_GAMMA2_Q_MINUS1_DIV88 ? 192 : 128);
- alloc_len = w1_encoded_len
- + sizeof(*p) * (1 + num_polys_k + num_polys_l + num_polys_k_by_l + num_polys_sig_k);
- alloc = OPENSSL_malloc(alloc_len);
- if (alloc == NULL)
+ w1_encoded = OPENSSL_malloc(w1_encoded_len);
+ if (w1_encoded == NULL)
return 0;
+
+ /* Allocate the POLY array 16-byte aligned; alloc_freeptr is the pointer to free */
+ poly_count = 1 + num_polys_k + num_polys_l + num_polys_k_by_l + num_polys_sig_k;
+ alloc_len = sizeof(*p) * poly_count;
+ alloc = OPENSSL_aligned_alloc(alloc_len, 16, &alloc_freeptr);
+ if (alloc == NULL)
+ goto err;
+
md_ctx = EVP_MD_CTX_new();
if (md_ctx == NULL)
goto err;
- w1_encoded = alloc;
- /* Init the temp vectors to point to the allocated polys blob */
- p = (POLY *)(w1_encoded + w1_encoded_len);
+ /* Init the temp vectors to point to the aligned polys blob */
+ p = (POLY *)alloc;
c_ntt = p++;
matrix_init(&a_ntt, p, k, l);
p += num_polys_k_by_l;
@@ -346,7 +350,14 @@ static int ml_dsa_sign_internal(const ML_DSA_KEY *priv,
}
err:
EVP_MD_CTX_free(md_ctx);
- OPENSSL_clear_free(alloc, alloc_len);
+ if (alloc_freeptr != NULL) {
+ /* Clear the actual sensitive buffer */
+ if (alloc != NULL)
+ OPENSSL_cleanse(alloc, alloc_len);
+ OPENSSL_free(alloc_freeptr);
+ }
+ if (w1_encoded != NULL)
+ OPENSSL_clear_free(w1_encoded, w1_encoded_len);
OPENSSL_cleanse(rho_prime, sizeof(rho_prime));
/*
* Declassify the private key material before returning. The key struct
@@ -380,7 +391,8 @@ static int ml_dsa_verify_internal(const ML_DSA_KEY *pub,
const uint8_t *sig_enc, size_t sig_enc_len)
{
int ret = 0;
- uint8_t *alloc = NULL, *w1_encoded;
+ uint8_t *alloc = NULL, *w1_encoded = NULL;
+ void *alloc_freeptr = NULL;
POLY *p, *c_ntt;
MATRIX a_ntt;
VECTOR az_ntt, ct1_ntt, *z_ntt, *w1, *w_approx;
@@ -394,6 +406,8 @@ static int ml_dsa_verify_internal(const ML_DSA_KEY *pub,
size_t num_polys_k = 2 * k;
size_t num_polys_l = 1 * l;
size_t num_polys_k_by_l = k * l;
+ size_t poly_count;
+ size_t alloc_len;
uint8_t c_tilde[ML_DSA_MAX_LAMBDA / 4];
uint8_t c_tilde_sig[ML_DSA_MAX_LAMBDA / 4];
EVP_MD_CTX *md_ctx = NULL;
@@ -406,19 +420,25 @@ static int ml_dsa_verify_internal(const ML_DSA_KEY *pub,
return 0;
}
- /* Allocate space for all the POLYNOMIALS used by temporary VECTORS */
+ /* Allocate w1_encoded buffer */
w1_encoded_len = k * (gamma2 == ML_DSA_GAMMA2_Q_MINUS1_DIV88 ? 192 : 128);
- alloc = OPENSSL_malloc(w1_encoded_len
- + sizeof(*p) * (1 + num_polys_k + num_polys_l + num_polys_k_by_l + num_polys_sig));
- if (alloc == NULL)
+ w1_encoded = OPENSSL_malloc(w1_encoded_len);
+ if (w1_encoded == NULL)
return 0;
+
+ /* Allocate the POLY array 16-byte aligned; alloc_freeptr is the pointer to free */
+ poly_count = 1 + num_polys_k + num_polys_l + num_polys_k_by_l + num_polys_sig;
+ alloc_len = sizeof(*p) * poly_count;
+ alloc = OPENSSL_aligned_alloc(alloc_len, 16, &alloc_freeptr);
+ if (alloc == NULL)
+ goto err;
+
md_ctx = EVP_MD_CTX_new();
if (md_ctx == NULL)
goto err;
- w1_encoded = alloc;
- /* Init the temp vectors to point to the allocated polys blob */
- p = (POLY *)(w1_encoded + w1_encoded_len);
+ /* Init the temp vectors to point to the aligned polys blob */
+ p = (POLY *)alloc;
c_ntt = p++;
matrix_init(&a_ntt, p, k, l);
p += num_polys_k_by_l;
@@ -463,7 +483,9 @@ static int ml_dsa_verify_internal(const ML_DSA_KEY *pub,
ret = (z_max < (uint32_t)(params->gamma1 - params->beta))
&& memcmp(c_tilde, sig.c_tilde, c_tilde_len) == 0;
err:
- OPENSSL_free(alloc);
+ if (alloc_freeptr != NULL)
+ OPENSSL_free(alloc_freeptr);
+ OPENSSL_free(w1_encoded);
EVP_MD_CTX_free(md_ctx);
return ret;
}
diff --git a/include/arch/s390x_arch.h b/include/arch/s390x_arch.h
index f5ff696dcd..95c01dd2dc 100644
--- a/include/arch/s390x_arch.h
+++ b/include/arch/s390x_arch.h
@@ -204,4 +204,9 @@ extern int OPENSSL_s390xcex_nodev;
#define S390X_KMAC_IIMP 0x4000
#define S390X_KMAC_CCUP 0x2000
+/* True only if both s390x vector facility bits (VX and VXE) are set in stfle[2]. */
+/* VX exists on >= z13 and VXE on >= z14, so requiring both implies >= z14. */
+#define S390X_VX_CAPABLE ( \
+ (OPENSSL_s390xcap_P.stfle[2] & S390X_CAPBIT(S390X_VX)) && (OPENSSL_s390xcap_P.stfle[2] & S390X_CAPBIT(S390X_VXE)))
+
#endif
diff --git a/include/internal/common.h b/include/internal/common.h
index d709b78c22..c29acccb37 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -27,12 +27,15 @@
#endif
#if defined(__GNUC__) || defined(__clang__)
+#define ALIGN16 __attribute((aligned(16)))
#define ALIGN32 __attribute((aligned(32)))
#define ALIGN64 __attribute((aligned(64)))
#elif defined(_MSC_VER)
+#define ALIGN16 __declspec(align(16))
#define ALIGN32 __declspec(align(32))
#define ALIGN64 __declspec(align(64))
#else
+#define ALIGN16 /* neither GCC/clang nor MSVC: expands to nothing, no alignment requested */
#define ALIGN32
#define ALIGN64
#endif