Commit a2079b82ba for openssl.org

commit a2079b82ba9b78f8a9e76c2f3387c39c0b44d28a
Author: Viktor Dukhovni <openssl-users@dukhovni.org>
Date:   Thu Apr 16 21:41:07 2026 +1000

    Drop value barrier from ML-DSA reduce_once

    This mirrors the corresponding code in ML-KEM and works under
    the same conditions/assumptions.  Also adjusted related
    functions that had two layers of constant_time selects
    where one suffices (now also matching BoringSSL).

    Intentionally uses the constant time instrumentation PR as its
    merge-base, so it should be merged after that has baked in for a few
    days and shows working CT tests in daily CI runs.

    Sample before/after performance pairs and percent throughput
    increases for one X86_64 CPU:

                  keygens/s    sign/s  verify/s
        ML-DSA-44   18728.3    6061.2   23251.6
        ML-DSA-44   21077.2    7392.4   27244.3
        ML-DSA-44     12.5%     22.0%     17.2%

        ML-DSA-65   10084.3    3603.0   13988.6
        ML-DSA-65   11197.9    4549.7   16208.4
        ML-DSA-65     11.0%     26.3%     15.9%

        ML-DSA-87    7184.8    2917.3    8141.0
        ML-DSA-87    8132.4    3693.7    9430.7
        ML-DSA-87     13.2%     26.6%     15.8%

    and here's the same for an Apple silicon M2:

                  keygens/s    sign/s  verify/s
        ML-DSA-44   17235.7    3099.3   15744.5
        ML-DSA-44   21855.2    4907.6   22849.0
        ML-DSA-44     26.8%     58.3%     45.1%

        ML-DSA-65    9165.8    1908.5   10058.3
        ML-DSA-65   11262.7    3069.6   14348.1
        ML-DSA-65     22.9%     60.8%     42.6%

        ML-DSA-87    6596.1    1563.6    6330.8
        ML-DSA-87    8404.9    2584.6    8767.6
        ML-DSA-87     27.4%     65.3%     38.5%

    Reviewed-by: Eugene Syromiatnikov <esyr@openssl.org>
    Reviewed-by: Nikola Pajkovsky <nikolap@openssl.org>
    MergeDate: Thu Apr 23 13:55:05 2026
    (Merged from https://github.com/openssl/openssl/pull/30864)

diff --git a/crypto/ml_dsa/ml_dsa_local.h b/crypto/ml_dsa/ml_dsa_local.h
index d4f63f7e99..bbaa6dafc7 100644
--- a/crypto/ml_dsa/ml_dsa_local.h
+++ b/crypto/ml_dsa/ml_dsa_local.h
@@ -101,20 +101,26 @@ int ossl_ml_dsa_poly_decode_expand_mask(POLY *out,
     const uint8_t *in, size_t in_len,
     uint32_t gamma1);

-/*
- * @brief Reduces x mod q in constant time
+/*-
+ * @brief Reduces 0 <= x < 2*q, mod q.
  * i.e. return x < q ? x : x - q;
  *
- * @param x Where x is assumed to be in the range 0 <= x < 2*q
+ * Subtracts |q| when x >= q, without exposing a side-channel, avoiding the
+ * "clangover" attack.  See |constish_time_true| for a discussion on why the
+ * value barrier is omitted by default.
+ *
  * @returns the difference in the range 0..q-1
  */
-static ossl_inline ossl_unused uint32_t reduce_once(uint32_t x)
+static ossl_inline ossl_unused __owur uint32_t reduce_once(uint32_t x)
 {
-    return constant_time_select_32(constant_time_lt_32(x, ML_DSA_Q), x, x - ML_DSA_Q);
+    const uint32_t subtracted = x - ML_DSA_Q;
+    uint32_t mask = constish_time_true(subtracted >> 31);
+
+    return (mask & x) | (~mask & subtracted);
 }

 /*
- * @brief Calculate The positive value of (a-b) mod q in constant time.
+ * @brief Calculates the positive value of (a-b) mod q in constant time.
  *
  * a - b mod q gives a value in the range -(q-1)..(q-1)
  * By adding q we get a range of 1..(2q-1).
@@ -131,21 +137,25 @@ static ossl_inline ossl_unused uint32_t mod_sub(uint32_t a, uint32_t b)

 /*
  * @brief Returns the absolute value in constant time.
- * i.e. return is_positive(x) ? x : -x;
+ * i.e. return (x has its sign bit set) ? -x : x;
  */
 static ossl_inline ossl_unused uint32_t abs_signed(uint32_t x)
 {
-    return constant_time_select_32(constant_time_lt_32(x, 0x80000000), x, 0u - x);
+    const uint32_t negated = 0u - x;
+
+    return constant_time_select_32(0u - (x >> 31), negated, x);
 }

 /*
  * @brief Returns the absolute value modulo q in constant time
- * i.e return x > (q - 1) / 2 ? q - x : x;
+ * i.e. return x <= (q-1)/2 ? x : q - x;
  */
 static ossl_inline ossl_unused uint32_t abs_mod_prime(uint32_t x)
 {
-    return constant_time_select_32(constant_time_lt_32(ML_DSA_Q_MINUS1_DIV2, x),
-        ML_DSA_Q - x, x);
+    /* Top bit of x - ((q-1)/2 + 1) is set iff x <= (q-1)/2 (for x < 2^31) */
+    uint32_t mask = 0u - ((x - (ML_DSA_Q_MINUS1_DIV2 + 1)) >> 31);
+
+    return constant_time_select_32(mask, x, ML_DSA_Q - x);
 }

 /*
@@ -154,7 +164,9 @@ static ossl_inline ossl_unused uint32_t abs_mod_prime(uint32_t x)
  */
 static ossl_inline ossl_unused uint32_t maximum(uint32_t x, uint32_t y)
 {
-    return constant_time_select_int(constant_time_lt(x, y), y, x);
+    uint32_t mask = 0u - ((x - y) >> 31); /* ~0 iff x < y, if |x-y| < 2^31 */
+
+    return constant_time_select_32(mask, y, x);
 }

 #endif /* OSSL_CRYPTO_ML_DSA_LOCAL_H */