Commit 3a69b19028 for openssl.org

commit 3a69b1902892883d81c41747b2230c5168511026
Author: Nick Nuon <myjpntrip@gmail.com>
Date:   Wed Nov 5 19:32:25 2025 -0500

    Added AVX2 encoding + scalar improvements

    Reviewed-by: Dmitry Belyavskiy <beldmit@gmail.com>
    Reviewed-by: Paul Dale <paul.dale@oracle.com>
    (Merged from https://github.com/openssl/openssl/pull/29178)

diff --git a/crypto/evp/bio_b64.c b/crypto/evp/bio_b64.c
index de95a57057..ff5c4e41ed 100644
--- a/crypto/evp/bio_b64.c
+++ b/crypto/evp/bio_b64.c
@@ -41,6 +41,8 @@ typedef struct b64_struct {
     EVP_ENCODE_CTX *base64;
     unsigned char buf[EVP_ENCODE_LENGTH(B64_BLOCK_SIZE) + 10];
     unsigned char tmp[B64_BLOCK_SIZE];
+    unsigned char *encoded_buf;
+    size_t encoded_buf_len;
 } BIO_B64_CTX;

 static const BIO_METHOD methods_b64 = {
@@ -72,6 +74,8 @@ static int b64_new(BIO *bi)

     ctx->cont = 1;
     ctx->start = 1;
+    ctx->encoded_buf = NULL;
+    ctx->encoded_buf_len = 0;
     ctx->base64 = EVP_ENCODE_CTX_new();
     if (ctx->base64 == NULL) {
         OPENSSL_free(ctx);
@@ -95,6 +99,9 @@ static int b64_free(BIO *a)
     if (ctx == NULL)
         return 0;

+    OPENSSL_free(ctx->encoded_buf);
+    ctx->encoded_buf = NULL;
+    ctx->encoded_buf_len = 0;
     EVP_ENCODE_CTX_free(ctx->base64);
     OPENSSL_free(ctx);
     BIO_set_data(a, NULL);
@@ -379,95 +386,36 @@ static int b64_write(BIO *b, const char *in, int inl)
     if (in == NULL || inl <= 0)
         return 0;

-    while (inl > 0) {
-        n = inl > B64_BLOCK_SIZE ? B64_BLOCK_SIZE : inl;
+    int encoded_length = EVP_ENCODE_LENGTH(inl);

-        if ((BIO_get_flags(b) & BIO_FLAGS_BASE64_NO_NL) != 0) {
-            if (ctx->tmp_len > 0) {
-                if (!ossl_assert(ctx->tmp_len <= 3)) {
-                    ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR);
-                    return ret == 0 ? -1 : ret;
-                }
-                n = 3 - ctx->tmp_len;
-                /*
-                 * There's a theoretical possibility for this
-                 */
-                if (n > inl)
-                    n = inl;
-                memcpy(&(ctx->tmp[ctx->tmp_len]), in, n);
-                ctx->tmp_len += n;
-                ret += n;
-                if (ctx->tmp_len < 3)
-                    break;
-                ctx->buf_len = EVP_EncodeBlock(ctx->buf, ctx->tmp, ctx->tmp_len);
-                if (!ossl_assert(ctx->buf_len <= (int)sizeof(ctx->buf))) {
-                    ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR);
-                    return ret == 0 ? -1 : ret;
-                }
-                if (!ossl_assert(ctx->buf_len >= ctx->buf_off)) {
-                    ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR);
-                    return ret == 0 ? -1 : ret;
-                }
-                /*
-                 * Since we're now done using the temporary buffer, the
-                 * length should be 0'd
-                 */
-                ctx->tmp_len = 0;
-            } else {
-                if (n < 3) {
-                    memcpy(ctx->tmp, in, n);
-                    ctx->tmp_len = n;
-                    ret += n;
-                    break;
-                }
-                n -= n % 3;
-                ctx->buf_len = EVP_EncodeBlock(ctx->buf, (unsigned char *)in, n);
-                if (!ossl_assert(ctx->buf_len <= (int)sizeof(ctx->buf))) {
-                    ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR);
-                    return ret == 0 ? -1 : ret;
-                }
-                if (!ossl_assert(ctx->buf_len >= ctx->buf_off)) {
-                    ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR);
-                    return ret == 0 ? -1 : ret;
-                }
-                ret += n;
-            }
-        } else {
-            if (!EVP_EncodeUpdate(ctx->base64, ctx->buf, &ctx->buf_len,
-                    (unsigned char *)in, n))
-                return ret == 0 ? -1 : ret;
-            if (!ossl_assert(ctx->buf_len <= (int)sizeof(ctx->buf))) {
-                ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR);
-                return ret == 0 ? -1 : ret;
-            }
-            if (!ossl_assert(ctx->buf_len >= ctx->buf_off)) {
-                ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR);
-                return ret == 0 ? -1 : ret;
-            }
-            ret += n;
+    if (ctx->encoded_buf == NULL || (size_t)encoded_length > ctx->encoded_buf_len) {
+        OPENSSL_free(ctx->encoded_buf);
+        ctx->encoded_buf = OPENSSL_malloc(encoded_length);
+        if (ctx->encoded_buf == NULL) {
+            ERR_raise(ERR_LIB_BIO, ERR_R_MALLOC_FAILURE);
+            return -1;
         }
-        inl -= n;
-        in += n;
+        ctx->encoded_buf_len = encoded_length;
+    }

-        ctx->buf_off = 0;
-        n = ctx->buf_len;
-        while (n > 0) {
-            i = BIO_write(next, &(ctx->buf[ctx->buf_off]), n);
-            if (i <= 0) {
-                BIO_copy_next_retry(b);
-                return ret == 0 ? i : ret;
-            }
-            n -= i;
-            ctx->buf_off += i;
-            if (!ossl_assert(ctx->buf_off <= (int)sizeof(ctx->buf))) {
-                ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR);
-                return ret == 0 ? -1 : ret;
-            }
-            if (!ossl_assert(ctx->buf_len >= ctx->buf_off)) {
-                ERR_raise(ERR_LIB_BIO, ERR_R_INTERNAL_ERROR);
-                return ret == 0 ? -1 : ret;
-            }
-        }
+    unsigned char *encoded = ctx->encoded_buf;
+
+    if (encoded == NULL) {
+        ERR_raise(ERR_LIB_BIO, ERR_R_MALLOC_FAILURE);
+        return -1;
+    }
+    int n_bytes_enc = 0;
+    if (!EVP_EncodeUpdate(ctx->base64, encoded, &n_bytes_enc,
+            (unsigned char *)in, inl)) {
+        if (ret == 0)
+            return -1;
+        return ret;
+    }
+    ret += inl;
+    i = BIO_write(next, encoded, n_bytes_enc);
+    if (i <= 0) {
+        BIO_copy_next_retry(b);
+        return ret == 0 ? i : ret;
         ctx->buf_len = 0;
         ctx->buf_off = 0;
     }
diff --git a/crypto/evp/build.info b/crypto/evp/build.info
index 45945afcab..5897acd943 100644
--- a/crypto/evp/build.info
+++ b/crypto/evp/build.info
@@ -8,6 +8,8 @@ $COMMON=digest.c evp_enc.c evp_lib.c evp_fetch.c evp_utils.c \

 SOURCE[../../libcrypto]=$COMMON\
         encode.c evp_key.c evp_cnf.c \
+        enc_b64_scalar.c \
+        enc_b64_avx2.c \
         e_des.c e_bf.c e_idea.c e_des3.c \
         e_rc4.c e_aes.c names.c e_aria.c e_sm4.c \
         e_xcbc_d.c e_rc2.c e_cast.c e_rc5.c m_null.c \
diff --git a/crypto/evp/enc_b64_avx2.c b/crypto/evp/enc_b64_avx2.c
new file mode 100644
index 0000000000..b3f46e66ae
--- /dev/null
+++ b/crypto/evp/enc_b64_avx2.c
@@ -0,0 +1,661 @@
+#include <openssl/evp.h>
+#include "enc_b64_scalar.h"
+#include "enc_b64_avx2.h"
+#include "internal/cryptlib.h"
+#include "crypto/evp.h"
+#include "evp_local.h"
+
+#if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
+#define STRINGIFY_IMPLEMENTATION_(a) #a
+#define STRINGIFY(a) STRINGIFY_IMPLEMENTATION_(a)
+
+#ifdef __clang__
+/*
+ * clang has no GCC-style push_options/pop_options, so use its attribute
+ * push/pop pragmas instead. Warning: up to clang 8.0, clang attribute push
+ * cannot be used within a namespace, so OPENSSL_TARGET_REGION and
+ * OPENSSL_UNTARGET_REGION must be placed outside of any namespace.
+ */
+#define OPENSSL_TARGET_REGION(T)                                       \
+    _Pragma(STRINGIFY(clang attribute push(__attribute__((target(T))), \
+        apply_to = function)))
+#define OPENSSL_UNTARGET_REGION _Pragma("clang attribute pop")
+#elif defined(__GNUC__)
+#define OPENSSL_TARGET_REGION(T) \
+    _Pragma("GCC push_options") _Pragma(STRINGIFY(GCC target(T)))
+#define OPENSSL_UNTARGET_REGION _Pragma("GCC pop_options")
+#endif /* clang then gcc */
+
+/* Default target region macros don't do anything. */
+#ifndef OPENSSL_TARGET_REGION
+#define OPENSSL_TARGET_REGION(T)
+#define OPENSSL_UNTARGET_REGION
+#endif
+
+#define OPENSSL_TARGET_AVX2 \
+    OPENSSL_TARGET_REGION("avx2")
+#define OPENSSL_UNTARGET_AVX2 OPENSSL_UNTARGET_REGION
+
+/*
+ * Code between OPENSSL_TARGET_AVX2 and OPENSSL_UNTARGET_AVX2 is compiled with
+ * AVX2 enabled on clang and GCC; on other compilers both expand to nothing.
+ */
+
+#include <string.h>
+#include <immintrin.h>
+#include <stddef.h>
+#include <stdint.h>
+
+OPENSSL_TARGET_AVX2
+static __m256i lookup_pshufb_std(__m256i input) /* maps 0..63 to "A-Za-z0-9+/" */
+{
+    __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); /* saturates to 0 for input <= 51 */
+    const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); /* signed compare: 26 > input */
+
+    result = _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); /* input 0..25 -> index 13 */
+    __m256i shift_LUT = _mm256_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
+        '/' - 63, 'A', 0, 0,
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62,
+        '/' - 63, 'A', 0, 0);
+
+    result = _mm256_shuffle_epi8(shift_LUT, result); /* per-byte ASCII offset selected by index */
+    return _mm256_add_epi8(result, input); /* offset + value = ASCII character */
+}
+OPENSSL_UNTARGET_AVX2
+
+OPENSSL_TARGET_AVX2
+static inline __m256i lookup_pshufb_srp(__m256i input) /* maps 0..63 to "0-9A-Za-z./" */
+{
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i hi = _mm256_set1_epi8((char)0x80);
+    __m256i invalid = _mm256_or_si256(_mm256_cmpgt_epi8(zero, input),
+        _mm256_cmpgt_epi8(input,
+            _mm256_set1_epi8(63))); /* signed input < 0 or > 63 */
+    __m256i idx = _mm256_setzero_si256();
+
+    idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(9))); /* +1 if input > 9 */
+    idx = _mm256_sub_epi8(idx, _mm256_cmpgt_epi8(input, _mm256_set1_epi8(35))); /* +1 if input > 35 */
+    idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(3),
+        _mm256_cmpeq_epi8(input, _mm256_set1_epi8(62))); /* 62 -> index 3 ('.') */
+    idx = _mm256_blendv_epi8(idx, _mm256_set1_epi8(4),
+        _mm256_cmpeq_epi8(input, _mm256_set1_epi8(63))); /* 63 -> index 4 ('/') */
+
+    /* Set the index high bit on invalid lanes so PSHUFB yields a zero offset there */
+    idx = _mm256_or_si256(idx, _mm256_and_si256(invalid, hi));
+
+    const __m256i shift_LUT = _mm256_setr_epi8('0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0,
+        '0' - 0, 'A' - 10, 'a' - 36, '.' - 62, '/' - 63, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0);
+
+    __m256i shift = _mm256_shuffle_epi8(shift_LUT, idx); /* per-byte ASCII offset */
+    __m256i ascii = _mm256_add_epi8(shift, input); /* offset + value = ASCII character */
+    return ascii;
+}
+OPENSSL_UNTARGET_AVX2
+
+OPENSSL_TARGET_AVX2
+static inline __m256i shift_right_zeros(__m256i v, int n) /* _mm256_srli_si256 with a run-time byte count */
+{
+    switch (n) { /* one case per count: each call below passes a literal shift amount */
+    case 0:
+        return v;
+    case 1:
+        return _mm256_srli_si256(v, 1);
+    case 2:
+        return _mm256_srli_si256(v, 2);
+    case 3:
+        return _mm256_srli_si256(v, 3);
+    case 4:
+        return _mm256_srli_si256(v, 4);
+    case 5:
+        return _mm256_srli_si256(v, 5);
+    case 6:
+        return _mm256_srli_si256(v, 6);
+    case 7:
+        return _mm256_srli_si256(v, 7);
+    case 8:
+        return _mm256_srli_si256(v, 8);
+    case 9:
+        return _mm256_srli_si256(v, 9);
+    case 10:
+        return _mm256_srli_si256(v, 10);
+    case 11:
+        return _mm256_srli_si256(v, 11);
+    case 12:
+        return _mm256_srli_si256(v, 12);
+    case 13:
+        return _mm256_srli_si256(v, 13);
+    case 14:
+        return _mm256_srli_si256(v, 14);
+    case 15:
+        return _mm256_srli_si256(v, 15);
+    default: /* any n outside 0..15 gives an all-zero vector */
+        return _mm256_setzero_si256();
+    }
+}
+OPENSSL_UNTARGET_AVX2
+
+OPENSSL_TARGET_AVX2
+static inline __m256i shift_left_zeros(__m256i v, int n) /* _mm256_slli_si256 with a run-time byte count */
+{
+    switch (n) { /* one case per count: each call below passes a literal shift amount */
+    case 0:
+        return v;
+    case 1:
+        return _mm256_slli_si256(v, 1);
+    case 2:
+        return _mm256_slli_si256(v, 2);
+    case 3:
+        return _mm256_slli_si256(v, 3);
+    case 4:
+        return _mm256_slli_si256(v, 4);
+    case 5:
+        return _mm256_slli_si256(v, 5);
+    case 6:
+        return _mm256_slli_si256(v, 6);
+    case 7:
+        return _mm256_slli_si256(v, 7);
+    case 8:
+        return _mm256_slli_si256(v, 8);
+    case 9:
+        return _mm256_slli_si256(v, 9);
+    case 10:
+        return _mm256_slli_si256(v, 10);
+    case 11:
+        return _mm256_slli_si256(v, 11);
+    case 12:
+        return _mm256_slli_si256(v, 12);
+    case 13:
+        return _mm256_slli_si256(v, 13);
+    case 14:
+        return _mm256_slli_si256(v, 14);
+    case 15:
+        return _mm256_slli_si256(v, 15);
+    case 16: /* listed explicitly, but returns the same value as default */
+        return _mm256_setzero_si256();
+    default: /* any other n gives an all-zero vector */
+        return _mm256_setzero_si256();
+    }
+}
+OPENSSL_UNTARGET_AVX2
+
+static const uint8_t shuffle_masks[16][16] = { /* row k: identity below k, 0x80 at k, then index - 1 */
+    { 0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+    { 0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+    { 0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+    { 0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+    { 0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+    { 0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+    { 0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+    { 0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14 },
+    { 0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14 },
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14 },
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14 },
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14 },
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14 },
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14 },
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14 },
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80 } /* index 15 never appears: the last byte falls off */
+};
+
+/**
+ * Insert '\n' at index K in [0,32) of the 32-byte input; input byte 31 is dropped.
+ */
+OPENSSL_TARGET_AVX2
+static inline __m256i insert_line_feed32(__m256i input, int K)
+{
+    __m256i line_feed_vector = _mm256_set1_epi8('\n');
+    __m128i identity = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+    if (K >= 16) { /* gap opens in the high lane; the low lane is kept as is */
+        __m128i maskhi = _mm_loadu_si128((__m128i *)shuffle_masks[K - 16]);
+        __m256i mask = _mm256_set_m128i(maskhi, identity);
+        __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80)); /* 0xFF where the '\n' goes */
+        __m256i shuffled = _mm256_shuffle_epi8(input, mask);
+        __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);
+
+        return result;
+    }
+    /* Replace the high lane with lo[15], hi[0..14], i.e. the input moved up by one byte */
+    __m256i shift = _mm256_alignr_epi8(input, _mm256_permute2x128_si256(input, input, 0x21),
+        15);
+    input = _mm256_blend_epi32(input, shift, 0xF0); /* 0xF0: take the upper four dwords from shift */
+    __m128i masklo = _mm_loadu_si128((__m128i *)shuffle_masks[K]);
+    __m256i mask = _mm256_set_m128i(identity, masklo);
+    __m256i lf_pos = _mm256_cmpeq_epi8(mask, _mm256_set1_epi8((char)0x80)); /* 0xFF where the '\n' goes */
+    __m256i shuffled = _mm256_shuffle_epi8(input, mask);
+    __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);
+    return result;
+}
+OPENSSL_UNTARGET_AVX2
+
+OPENSSL_TARGET_AVX2
+static inline size_t ins_nl_gt32(__m256i v, uint8_t *out, int stride, /* returns bytes written: 32 or 33 */
+    int *wrap_cnt)
+{
+    const int until_nl = stride - *wrap_cnt; /* characters left on the current line */
+
+    if (until_nl > 32) { /* the line continues past these 32 characters */
+        _mm256_storeu_si256((__m256i *)out, v);
+
+        *wrap_cnt += 32;
+        return 32;
+    }
+
+    if (until_nl == 32) { /* the line ends exactly after these 32 characters */
+        _mm256_storeu_si256((__m256i *)out, v);
+
+        out[32] = '\n';
+        *wrap_cnt = 0;
+        return 33;
+    }
+
+    const uint8_t last = (uint8_t)_mm256_extract_epi8(v, 31); /* saved here and stored at out[32] below */
+    const __m256i with_lf = insert_line_feed32(v, until_nl);
+    _mm256_storeu_si256((__m256i *)out, with_lf);
+    out[32] = last;
+
+    *wrap_cnt = 32 - until_nl; /* characters already placed on the new line */
+    return 33;
+}
+OPENSSL_UNTARGET_AVX2
+
+OPENSSL_TARGET_AVX2
+static inline size_t insert_nl_gt16(const __m256i v0, /* returns the number of bytes written to output */
+    uint8_t *output,
+    int wrap_max, int *wrap_cnt)
+{
+    uint8_t *out = output;
+    int wrap_rem = wrap_max - *wrap_cnt; /* characters left on the current line */
+    _mm256_storeu_si256((__m256i *)(output), v0); /* unmodified store; patched below when a break is due */
+
+    if (wrap_rem > 32) { /* no line break falls within these 32 characters */
+        *wrap_cnt += 32;
+        return 32;
+    }
+
+    __m256i all_ff_mask = _mm256_set1_epi8((char)0xFF);
+
+    __m256i mask_second_lane = _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
+        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
+        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF,
+        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF);
+
+    __m256i blended_0L = v0;
+    int surplus_0 = wrap_rem < 16 ? 1 : 0; /* 1 when a '\n' is due within the first 16 bytes */
+    if (surplus_0 == 1) {
+        __m256i shifted_0_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem),
+            wrap_rem + surplus_0);
+        __m256i mask_shifted_0_L = shift_left_zeros(all_ff_mask, wrap_rem + surplus_0);
+        __m256i mask = _mm256_or_si256(mask_shifted_0_L, mask_second_lane);
+        __m256i shifted_1_L = shift_left_zeros(v0, 1);
+        __m256i shifted = _mm256_blendv_epi8(shifted_0_L, shifted_1_L, mask);
+
+        blended_0L = _mm256_blendv_epi8(v0, shifted, mask);
+        _mm256_storeu_si256((__m256i *)(output), blended_0L);
+        wrap_rem += wrap_max; /* input offset at which the following break is due */
+    }
+
+    int surplus_1 = (wrap_rem >= 16 && wrap_rem < 32) ? 1 : 0; /* 1 when a break is due at input offset 16..31 */
+    int last_of_1L = _mm256_extract_epi8(v0, 31);
+
+    if (surplus_1 == 1) {
+        uint16_t sec_last_of_1L = _mm256_extract_epi8(v0, 30);
+        int wrap_rem_1 = wrap_rem - 16;
+        __m256i shifted_1_L = shift_left_zeros(shift_right_zeros(v0, wrap_rem_1),
+            wrap_rem_1 + surplus_0 + surplus_1);
+        __m256i mask_shifted_1_L = shift_left_zeros(all_ff_mask, wrap_rem_1 + surplus_0 + surplus_1);
+        __m256i mask = _mm256_and_si256(mask_second_lane, mask_shifted_1_L);
+        __m256i blended_1L = _mm256_blendv_epi8(blended_0L, shifted_1_L, mask);
+        _mm256_storeu_si256((__m256i *)(output), blended_1L);
+
+        output[wrap_rem + surplus_0] = '\n';
+        output[31 + surplus_0] = (uint8_t)sec_last_of_1L;
+        output[31 + surplus_0 + surplus_1] = last_of_1L;
+    }
+
+    if (surplus_0 == 1) {
+        output[wrap_rem - wrap_max] = '\n'; /* wrap_rem - wrap_max is the value wrap_rem had on entry */
+        output[16] = _mm256_extract_epi8(v0, 15);
+        output[31 + surplus_0 + surplus_1] = last_of_1L;
+    }
+
+    *wrap_cnt = wrap_rem > 32 ? 32 - (wrap_rem - wrap_max) : 32 - wrap_rem;
+
+    int nl_at_end = 0;
+    if (*wrap_cnt == wrap_max || *wrap_cnt == 0) { /* the line is full: end it with a trailing '\n' */
+        *wrap_cnt = 0;
+        output[32 + surplus_0 + surplus_1] = '\n';
+        nl_at_end = 1;
+    }
+
+    out += 32 + surplus_0 + surplus_1 + nl_at_end; /* 32 characters plus every '\n' counted above */
+    size_t written = (size_t)(out - output);
+
+    return written;
+}
+OPENSSL_UNTARGET_AVX2
+
+OPENSSL_TARGET_AVX2
+static inline size_t insert_nl_2nd_vec_stride_12(const __m256i v0, /* writes 35 bytes; '\n' at 4, 17 and 30 */
+    uint8_t *output,
+    int dummy_stride, /* not read by this function */
+    int *wrap_cnt)
+{
+    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+        (char)0xFF,
+        (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, (char)0xFF,
+        12);
+    __m256i shuffled = _mm256_shuffle_epi8(v0, shuffling_mask); /* 0xFF entries produce zero bytes, filled in below */
+
+    _mm256_storeu_si256((__m256i *)(output + 0), shuffled);
+
+    int16_t rem_1_L_ext = _mm256_extract_epi16(v0, 7); /* input bytes 14..15 */
+    int8_t rem_2_L_ext_P1 = _mm256_extract_epi8(v0, 29);
+    int16_t rem_2_L_ext_P2 = _mm256_extract_epi16(v0, 15); /* input bytes 30..31 */
+
+    uint8_t *out = output;
+    out[4] = '\n';
+    memcpy(out + 15, &rem_1_L_ext, sizeof(rem_1_L_ext));
+    out[16 + 1] = '\n';
+    memcpy(out + 15 + 17, &rem_2_L_ext_P1, sizeof(rem_2_L_ext_P1));
+    out[16 + 14] = '\n';
+    memcpy(out + 15 + 17 + 1, &rem_2_L_ext_P2, sizeof(rem_2_L_ext_P2));
+
+    out += 32 + 3; /* 32 characters plus the three '\n' above */
+    *wrap_cnt = 4; /* four characters follow the last '\n' */
+
+    size_t written = (out - output);
+    return written;
+}
+OPENSSL_UNTARGET_AVX2
+
+OPENSSL_TARGET_AVX2
+static inline __m256i insert_newlines_by_mask(__m256i data, __m256i mask) /* bitwise select: '\n' where mask is set */
+{
+    __m256i newline = _mm256_set1_epi8('\n');
+
+    return _mm256_or_si256(_mm256_and_si256(mask, newline),
+        _mm256_andnot_si256(mask, data)); /* data bits survive only where mask is clear */
+}
+OPENSSL_UNTARGET_AVX2
+
+OPENSSL_TARGET_AVX2
+static inline size_t insert_nl_str4(const __m256i v0, uint8_t *output) /* writes 40 bytes; returns 40 */
+{
+    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, (char)0xFF, 4, 5, 6,
+        7, (char)0xFF, 8, 9, 10, 11, (char)0xFF, 12,
+        (char)0xFF, (char)0xFF, (char)0xFF, (char)0xFF, 0, 1, 2, 3,
+        (char)0xFF, 4, 5, 6, 7, (char)0xFF, 8, 9);
+    __m256i mask_5_bytes = _mm256_setr_epi8(0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
+        0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
+        0, 0, 0, 0, (char)0xFF, 0, 0, 0, 0, (char)0xFF,
+        0, 0); /* 0xFF at offsets 4, 9, 14, 19, 24 and 29 */
+    __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask);
+    __m256i v0_w_nl = insert_newlines_by_mask(shuffled_4_bytes, mask_5_bytes);
+
+    _mm256_storeu_si256((__m256i *)(output + 0), v0_w_nl);
+
+    /* Handle cross-lane remainder logic */
+    /* Without macros, _mm256_srli_si256 complains that the last arg must be an 8-bit immediate */
+#define B_LANE 16 /* Bytes per lane */
+#define N_RET_1_L 3 /* bytes "shifted out" of lane 0 */
+#define N_RET_2_L (N_RET_1_L + 4) /* bytes "shifted out" of lane 1 */
+
+    /* Bytes that were shifted out of lane 0 */
+    __m256i rem_1_L = _mm256_srli_si256(v0, B_LANE - N_RET_1_L);
+
+    /* Bytes that were shifted out of lane 1 */
+    __m256i rem_2_L_P1 = _mm256_srli_si256(_mm256_slli_si256(_mm256_srli_si256(v0, B_LANE - N_RET_2_L),
+                                               B_LANE - N_RET_1_L),
+        B_LANE - 2);
+
+    /* Isolate the remaining bytes that were shifted out of lane 1 */
+    __m256i rem_2_L_P2 = _mm256_slli_si256(
+        _mm256_srli_si256(v0,
+            B_LANE - N_RET_2_L + N_RET_1_L),
+        N_RET_1_L);
+
+    __m256i rem_2_L = _mm256_or_si256(rem_2_L_P1, rem_2_L_P2);
+
+    int32_t rem_1_L_ext = _mm256_extract_epi32(rem_1_L, 0);
+    int64_t rem_2_L_ext = _mm256_extract_epi64(rem_2_L, 2);
+
+    uint8_t *out = output + 16;
+    memcpy(out, &rem_1_L_ext, sizeof(rem_1_L_ext)); /* copies 4 bytes; the 4th is overwritten by '\n' below */
+    out += 3;
+    *out++ = '\n';
+
+    out = output + 32;
+    memcpy(out, &rem_2_L_ext, sizeof(rem_2_L_ext)); /* copies 8 bytes, up to output[39] */
+    out += 2;
+    *out++ = '\n'; /* output[34] */
+    out += 4;
+    *out++ = '\n'; /* output[39] */
+
+    size_t written = (out - output);
+    return written;
+}
+OPENSSL_UNTARGET_AVX2
+
+OPENSSL_TARGET_AVX2
+static inline size_t insert_nl_str8(const __m256i v0, uint8_t *output) /* writes 36 bytes; returns 36 */
+{
+    __m256i shuffling_mask = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, (char)0xFF,
+        8, 9, 10, 11, 12, 13, 14,
+        (char)0xFF, (char)0xFF, 0, 1, 2, 3, 4, 5, 6,
+        7, (char)0xFF, 8, 9, 10, 11, 12);
+    __m256i shuffled_4_bytes = _mm256_shuffle_epi8(v0, shuffling_mask); /* 0xFF entries give zero bytes, filled in below */
+    _mm256_storeu_si256((__m256i *)(output), shuffled_4_bytes);
+    int8_t rem_1_L = _mm256_extract_epi8(v0, 15);
+    int8_t rem_2_L_P1 = _mm256_extract_epi8(v0, 29);
+    int16_t rem_2_L_P2 = _mm256_extract_epi16(v0, 15); /* input bytes 30..31 */
+    uint8_t *out = output;
+
+    memcpy(out + 16, &rem_1_L, sizeof(rem_1_L));
+    memcpy(out + 32, &rem_2_L_P1, sizeof(rem_2_L_P1));
+    memcpy(out + 32 + 1, &rem_2_L_P2, sizeof(rem_2_L_P2));
+
+    output[8] = '\n'; /* one '\n' after each run of 8 characters */
+    output[17] = '\n';
+    output[26] = '\n';
+    output[35] = '\n';
+
+    out += 32 + 4; /* 32 characters plus four '\n' */
+
+    size_t written = (out - output);
+    return written;
+}
+OPENSSL_UNTARGET_AVX2
+
+OPENSSL_TARGET_AVX2
+int encode_base64_avx2(EVP_ENCODE_CTX *ctx, unsigned char *dst, /* returns bytes stored here plus evp_encodeblock_int()'s result */
+    const unsigned char *src, int srclen, int ctx_length,
+    int *final_wrap_cnt)
+{
+    const uint8_t *input = (const uint8_t *)src;
+    uint8_t *out = (uint8_t *)dst;
+    int i = 0;
+    int stride = (ctx == NULL) ? 0 : ctx_length / 3 * 4; /* output chars between '\n'; 0 means no '\n' is inserted */
+    int wrap_cnt = 0;
+    const int use_srp = (ctx != NULL
+        && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0);
+    const __m256i shuf = _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
+        10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); /* uses only bytes 0..11 of each 16-byte lane */
+    int base = 0; /* cycles 0, 1, 2 over iterations; read only when stride == 12 */
+
+    /* Process 96 input bytes (four 32-character vectors) per iteration */
+    for (; i + 100 <= srclen; i += 96) {
+        /* Each 16-byte load feeds 12 input bytes; the last one reads up to input[i + 99] */
+        const __m128i lo0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 0));
+        const __m128i hi0 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 1));
+        const __m128i lo1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 2));
+        const __m128i hi1 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 3));
+        const __m128i lo2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 4));
+        const __m128i hi2 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 5));
+        const __m128i lo3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 6));
+        const __m128i hi3 = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3 * 7));
+        __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
+        __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
+        __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
+        __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
+        const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
+        const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
+        const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
+        const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
+        const __m256i t1_0 = _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
+        const __m256i t1_1 = _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
+        const __m256i t1_2 = _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
+        const __m256i t1_3 = _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
+        const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
+        const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
+        const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
+        const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
+        const __m256i t3_0 = _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
+        const __m256i t3_1 = _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
+        const __m256i t3_2 = _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
+        const __m256i t3_3 = _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
+        const __m256i input0 = _mm256_or_si256(t1_0, t3_0); /* values handed to the alphabet lookup */
+        const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
+        const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
+        const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
+        __m256i vec0;
+        __m256i vec1;
+        __m256i vec2;
+        __m256i vec3;
+
+        if (use_srp) {
+            vec0 = lookup_pshufb_srp(input0);
+            vec1 = lookup_pshufb_srp(input1);
+            vec2 = lookup_pshufb_srp(input2);
+            vec3 = lookup_pshufb_srp(input3);
+
+        } else {
+            vec0 = lookup_pshufb_std(input0);
+            vec1 = lookup_pshufb_std(input1);
+            vec2 = lookup_pshufb_std(input2);
+            vec3 = lookup_pshufb_std(input3);
+        }
+
+        if (stride == 0) { /* 128 characters, no '\n' */
+            _mm256_storeu_si256((__m256i *)out, vec0);
+
+            out += 32;
+            _mm256_storeu_si256((__m256i *)out, vec1);
+
+            out += 32;
+            _mm256_storeu_si256((__m256i *)out, vec2);
+
+            out += 32;
+            _mm256_storeu_si256((__m256i *)out, vec3);
+
+            out += 32;
+        } else if (stride == 64) { /* '\n' after every second vector; wrap_cnt is not touched */
+            _mm256_storeu_si256((__m256i *)out, vec0);
+
+            out += 32;
+            _mm256_storeu_si256((__m256i *)out, vec1);
+
+            out += 32;
+            *(out++) = '\n';
+
+            _mm256_storeu_si256((__m256i *)out, vec2);
+            out += 32;
+
+            _mm256_storeu_si256((__m256i *)out, vec3);
+            out += 32;
+
+            *(out++) = '\n';
+        } else if (stride == 4) {
+            int out_idx = 0;
+
+            out_idx += (int)insert_nl_str4(vec0, out + out_idx);
+            out_idx += (int)insert_nl_str4(vec1, out + out_idx);
+            out_idx += (int)insert_nl_str4(vec2, out + out_idx);
+            out_idx += (int)insert_nl_str4(vec3, out + out_idx);
+
+            out += out_idx;
+        } else if (stride == 8) {
+
+            out += insert_nl_str8(vec0, out);
+            out += insert_nl_str8(vec1, out);
+            out += insert_nl_str8(vec2, out);
+            out += insert_nl_str8(vec3, out);
+
+        } else if (stride == 12) {
+            switch (base) { /* which vectors take the dedicated stride-12 helper depends on base */
+            case 0:
+
+                out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
+                out += insert_nl_2nd_vec_stride_12(vec1, out, stride, &wrap_cnt);
+                out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
+                out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
+                break;
+            case 1:
+                out += insert_nl_2nd_vec_stride_12(vec0, out, stride, &wrap_cnt);
+                out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
+                out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
+                out += insert_nl_2nd_vec_stride_12(vec3, out, stride, &wrap_cnt);
+                break;
+            default: /* base == 2 */
+                out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
+                out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
+                out += insert_nl_2nd_vec_stride_12(vec2, out, stride, &wrap_cnt);
+                out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
+                break;
+            }
+
+            if (++base == 3)
+                base = 0;
+        } else if (stride >= 32) { /* stride 64 was already handled above */
+            out += ins_nl_gt32(vec0, out, stride, &wrap_cnt);
+            out += ins_nl_gt32(vec1, out, stride, &wrap_cnt);
+            out += ins_nl_gt32(vec2, out, stride, &wrap_cnt);
+            out += ins_nl_gt32(vec3, out, stride, &wrap_cnt);
+        } else if (stride >= 16) {
+            out += insert_nl_gt16(vec0, out, stride, &wrap_cnt);
+            out += insert_nl_gt16(vec1, out, stride, &wrap_cnt);
+            out += insert_nl_gt16(vec2, out, stride, &wrap_cnt);
+            out += insert_nl_gt16(vec3, out, stride, &wrap_cnt);
+        }
+    }
+
+    if (stride == 0) { /* single-vector loop, used only when no '\n' is inserted */
+        for (; i + 28 <= srclen; i += 24) { /* the second 16-byte load reads up to input[i + 27] */
+            /* lo = [xxxx|DDDC|CCBB|BAAA] */
+            /* hi = [xxxx|HHHG|GGFF|FEEE] */
+            const __m128i lo = _mm_loadu_si128((const __m128i *)(input + i));
+            const __m128i hi = _mm_loadu_si128((const __m128i *)(input + i + 4 * 3));
+            /*
+             * bytes from groups A, B and C are needed in separate 32-bit lanes
+             * in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
+             */
+            __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
+            const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
+            const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+            const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
+            const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+            const __m256i indices = _mm256_or_si256(t1, t3);
+            _mm256_storeu_si256((__m256i *)out, (use_srp ? lookup_pshufb_srp : lookup_pshufb_std)(indices));
+
+            out += 32;
+        }
+    }
+    *final_wrap_cnt = wrap_cnt; /* the same pointer is passed to evp_encodeblock_int() below */
+
+    if (stride >= 32 && wrap_cnt == stride) {
+        wrap_cnt = 0; /* NOTE(review): local only; *final_wrap_cnt was set above and is not reset here */
+        *out++ = '\n';
+    }
+
+    return (int)(out - (uint8_t *)dst) + +evp_encodeblock_int(ctx, out, src + i, srclen - i, final_wrap_cnt); /* NOTE(review): second '+' is a no-op unary plus */
+}
+OPENSSL_UNTARGET_AVX2
+#endif
diff --git a/crypto/evp/enc_b64_avx2.h b/crypto/evp/enc_b64_avx2.h
new file mode 100644
index 0000000000..9c871ac13c
--- /dev/null
+++ b/crypto/evp/enc_b64_avx2.h
@@ -0,0 +1,12 @@
+#ifndef OSSL_CRYPTO_EVP_B64_AVX2_H
+#define OSSL_CRYPTO_EVP_B64_AVX2_H
+
+#include <openssl/evp.h>
+
+#if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
+int encode_base64_avx2(EVP_ENCODE_CTX *ctx,
+    unsigned char *out, const unsigned char *src, int srclen,
+    int newlines, int *wrap_cnt);
+#endif /* x86-64 targets only */
+
+#endif /* OSSL_CRYPTO_EVP_B64_AVX2_H */
diff --git a/crypto/evp/enc_b64_scalar.c b/crypto/evp/enc_b64_scalar.c
new file mode 100644
index 0000000000..d7b10f648b
--- /dev/null
+++ b/crypto/evp/enc_b64_scalar.c
@@ -0,0 +1,280 @@
+#include <openssl/evp.h>
+#include "internal/cryptlib.h"
+#include "crypto/evp.h"
+#include "evp_local.h"
+#include "enc_b64_scalar.h"
+
+static const unsigned char base64_srp_bin2ascii_0[256] = { /* [b] = SRP alphabet char for b >> 2 */
+    '0', '0', '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3',
+    '4', '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
+    '8', '8', '8', '8', '9', '9', '9', '9', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B',
+    'C', 'C', 'C', 'C', 'D', 'D', 'D', 'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F',
+    'G', 'G', 'G', 'G', 'H', 'H', 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J',
+    'K', 'K', 'K', 'K', 'L', 'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N',
+    'O', 'O', 'O', 'O', 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R',
+    'S', 'S', 'S', 'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V',
+    'W', 'W', 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z',
+    'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
+    'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h', 'h',
+    'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l', 'l', 'l',
+    'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p', 'p', 'p', 'p',
+    'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's', 't', 't', 't', 't',
+    'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w', 'w', 'x', 'x', 'x', 'x',
+    'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '.', '.', '.', '.', '/', '/', '/', '/'
+};
+
+static const unsigned char base64_srp_bin2ascii_1[256] = { /* [b] = SRP alphabet char for b & 0x3f */
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
+    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
+    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
+    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
+    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/'
+};
+
+static const unsigned char base64_srp_bin2ascii_2[256] = { /* [b] = SRP alphabet char for b & 0x3f */
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
+    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
+    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
+    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
+    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', '/'
+};
+
+static const unsigned char base64_std_bin2ascii_0[256] = { /* [b] = standard alphabet char for b >> 2 */
+    'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
+    'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H',
+    'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L',
+    'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O',
+    'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S',
+    'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W',
+    'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a',
+    'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
+    'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h',
+    'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l',
+    'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p',
+    'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's',
+    't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w',
+    'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0',
+    '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4',
+    '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
+    '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', '+', '+', '/', '/', '/',
+    '/'
+};
+
+static const unsigned char base64_std_bin2ascii_1[256] = { /* [b] = standard alphabet char for b & 0x3f */
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C',
+    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
+    '/'
+};
+
+static const unsigned char base64_std_bin2ascii_2[256] = { /* [b] = standard alphabet char for b & 0x3f */
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C',
+    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
+    '/'
+};
+
+/*
+ * Base64-encode |dlen| bytes from |f| into |t| and NUL-terminate the output.
+ *
+ * |ctx| may be NULL, in which case the standard alphabet is used and no
+ * newlines are emitted.  Otherwise the alphabet is selected by
+ * EVP_ENCODE_CTX_USE_SRP_ALPHABET and, unless EVP_ENCODE_CTX_NO_NEWLINES is
+ * set, the output is broken into lines of ctx->length input bytes each.
+ * A line whose length is not a multiple of 3 is '=' padded before its '\n'.
+ *
+ * |*wrap_cnt| is treated as the number of output characters already on the
+ * current line.  It is only read, never updated.
+ *
+ * Returns the number of bytes stored in |t|, not counting the final NUL.
+ * When ctx->length is not a multiple of 3 the newline that ends the last
+ * line is not included; the caller is expected to add it.
+ */
+int evp_encodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t,
+    const unsigned char *f, int dlen, int *wrap_cnt)
+{
+    int i = 0;
+    int ret = 0;
+    int line_used = 0; /* input bytes encoded on the current line */
+    uint8_t t1, t2, t3;
+    const unsigned char *e0, *e1, *e2;
+    const int srp = (ctx != NULL
+        && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0);
+    const int wrap_cnt_by_input = *wrap_cnt / 4 * 3;
+    /*
+     * Line length in input bytes, or 0 when no line breaks are wanted: NULL
+     * ctx, EVP_ENCODE_CTX_NO_NEWLINES set, or a non-positive ctx->length.
+     * Folding these cases together keeps every '%' below away from a zero
+     * divisor and guarantees that each loop iteration consumes input.
+     */
+    const int line_len = (ctx != NULL && ctx->length > 0
+                             && (ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0)
+        ? ctx->length
+        : 0;
+
+    if (srp) {
+        e0 = base64_srp_bin2ascii_0;
+        e1 = base64_srp_bin2ascii_1;
+        e2 = base64_srp_bin2ascii_2;
+    } else {
+        e0 = base64_std_bin2ascii_0;
+        e1 = base64_std_bin2ascii_1;
+        e2 = base64_std_bin2ascii_2;
+    }
+
+    if (line_len == 1) {
+        /* One input byte per line: every byte becomes "XX==\n". */
+        for (i = 0; i < dlen; i++) {
+            t1 = f[i];
+            *(t++) = e0[t1];
+            *(t++) = e1[(t1 & 0x03) << 4];
+            *(t++) = '=';
+            *(t++) = '=';
+            *(t++) = '\n';
+            ret += 5;
+        }
+        *t = '\0';
+
+        /* Do not count the last newline; never report a negative size. */
+        return ret > 0 ? ret - 1 : 0;
+    }
+
+    if (line_len % 3 != 0) {
+        /* Lines end inside a 3-byte group, so each one needs padding. */
+        while (i + 2 < dlen) {
+            if (line_used < line_len
+                && line_used + 3 + wrap_cnt_by_input > line_len) {
+                /* Less than one full group left on this line: pad it. */
+                t1 = f[i];
+                *(t++) = e0[t1];
+                if (line_len % 3 == 1) {
+                    *(t++) = e1[(t1 & 0x03) << 4];
+                    *(t++) = '=';
+                    i++;
+                } else {
+                    t2 = f[i + 1];
+                    *(t++) = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+                    *(t++) = e2[(t2 & 0x0F) << 2];
+                    i += 2;
+                }
+                *(t++) = '=';
+                *(t++) = '\n';
+                ret += 5;
+                line_used = 0;
+            }
+
+            /* A 2-byte line never has room for a complete group. */
+            if (line_len >= 4 && i + 2 < dlen) {
+                t1 = f[i];
+                t2 = f[i + 1];
+                t3 = f[i + 2];
+                *(t++) = e0[t1];
+                *(t++) = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+                *(t++) = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
+                *(t++) = e2[t3];
+                ret += 4;
+                line_used += 3;
+                i += 3;
+            }
+        }
+    } else {
+        /* No line breaks, or lines that hold whole 3-byte groups. */
+        for (i = 0; i + 2 < dlen; i += 3) {
+            t1 = f[i];
+            t2 = f[i + 1];
+            t3 = f[i + 2];
+            *(t++) = e0[t1];
+            *(t++) = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+            *(t++) = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
+            *(t++) = e2[t3];
+            ret += 4;
+
+            if (line_len > 0 && (i + 3 + wrap_cnt_by_input) % line_len == 0) {
+                *(t++) = '\n';
+                ret++;
+            }
+        }
+    }
+
+    /*
+     * Encode the final partial group, if any.  No newline can follow it: when
+     * the line length is a multiple of 3, the input consumed so far and
+     * wrap_cnt_by_input are multiples of 3 as well, so 1 or 2 more bytes never
+     * complete a line; for any other line length the last line is left for
+     * the caller to terminate.
+     */
+    switch (dlen - i) {
+    case 1:
+        t1 = f[i];
+        *(t++) = e0[t1];
+        *(t++) = e1[(t1 & 0x03) << 4];
+        *(t++) = '=';
+        *(t++) = '=';
+        ret += 4;
+        break;
+    case 2:
+        t1 = f[i];
+        t2 = f[i + 1];
+        *(t++) = e0[t1];
+        *(t++) = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+        *(t++) = e2[(t2 & 0x0F) << 2];
+        *(t++) = '=';
+        ret += 4;
+        break;
+    default:
+        break;
+    }
+
+    *t = '\0';
+    return ret;
+}
diff --git a/crypto/evp/enc_b64_scalar.h b/crypto/evp/enc_b64_scalar.h
new file mode 100644
index 0000000000..91d416f758
--- /dev/null
+++ b/crypto/evp/enc_b64_scalar.h
@@ -0,0 +1,8 @@
+#ifndef OSSL_CRYPTO_EVP_B64_SCALAR_H
+#define OSSL_CRYPTO_EVP_B64_SCALAR_H
+#include <openssl/evp.h>
+
+int evp_encodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t,
+    const unsigned char *f, int dlen, int *wrap_cnt);
+
+#endif /* OSSL_CRYPTO_EVP_B64_SCALAR_H */
diff --git a/crypto/evp/encode.c b/crypto/evp/encode.c
index 53575e7b60..bbd36c0820 100644
--- a/crypto/evp/encode.c
+++ b/crypto/evp/encode.c
@@ -14,25 +14,20 @@
 #include "crypto/evp.h"
 #include "evp_local.h"

+#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && (defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64))
+/* Set when OPENSSL_ia32cap_P is consulted below to pick the AVX2 encoder */
+#define HAS_IA32CAP_IS_64
+#endif
+
+#include "enc_b64_avx2.h"
+#include "enc_b64_scalar.h"
+
 static unsigned char conv_ascii2bin(unsigned char a,
     const unsigned char *table);
-static int evp_encodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t,
-    const unsigned char *f, int dlen);
+int evp_encodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t,
+    const unsigned char *f, int dlen, int *wrap_cnt);
 static int evp_decodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t,
     const unsigned char *f, int n, int eof);
-
-#ifndef CHARSET_EBCDIC
-#define conv_bin2ascii(a, table) ((table)[(a) & 0x3f])
-#else
-/*
- * We assume that PEM encoded files are EBCDIC files (i.e., printable text
- * files). Convert them here while decoding. When encoding, output is EBCDIC
- * (text) format again. (No need for conversion in the conv_bin2ascii macro,
- * as the underlying textstring data_bin2ascii[] is already EBCDIC)
- */
-#define conv_bin2ascii(a, table) ((table)[(a) & 0x3f])
-#endif
-
 /*-
  * 64 char lines
  * pad input with 0
@@ -45,11 +40,6 @@ static int evp_decodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t,
 #define CHUNKS_PER_LINE (64 / 4)
 #define CHAR_PER_LINE (64 + 1)

-static const unsigned char data_bin2ascii[65] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-/* SRP uses a different base64 alphabet */
-static const unsigned char srpdata_bin2ascii[65] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz./";
-
 /*-
  * 0xF0 is a EOLN
  * 0xF1 is ignore but next needs to be 0xF0 (for \r\n processing).
@@ -400,28 +390,52 @@ int EVP_EncodeUpdate(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl,
         memcpy(&(ctx->enc_data[ctx->num]), in, i);
         in += i;
         inl -= i;
-        j = evp_encodeblock_int(ctx, out, ctx->enc_data, ctx->length);
+        int wrap_cnt = 0;
+        j = evp_encodeblock_int(ctx, out, ctx->enc_data, ctx->length,
+            &wrap_cnt);
         ctx->num = 0;
         out += j;
         total = j;
-        if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) {
-            *(out++) = '\n';
-            total++;
-        }
         *out = '\0';
     }
-    while (inl >= ctx->length && total <= INT_MAX) {
-        j = evp_encodeblock_int(ctx, out, in, ctx->length);
-        in += ctx->length;
-        inl -= ctx->length;
-        out += j;
-        total += j;
-        if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0) {
-            *(out++) = '\n';
-            total++;
+    int wrap_cnt = 0;
+    if (ctx->length % 3 != 0) {
+        j = evp_encodeblock_int(ctx, out, in, inl - (inl % ctx->length),
+            &wrap_cnt);
+    } else {
+#if defined(__AVX2__)
+        const int newlines = !(ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) ? ctx->length : 0;
+
+        j = encode_base64_avx2(ctx,
+            (unsigned char *)out,
+            (const unsigned char *)in,
+            inl - (inl % ctx->length), newlines, &wrap_cnt);
+#elif defined(HAS_IA32CAP_IS_64)
+        if ((OPENSSL_ia32cap_P[2] & (1u << 5)) != 0) {
+            const int newlines = !(ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) ? ctx->length : 0;
+
+            j = encode_base64_avx2(ctx,
+                (unsigned char *)out,
+                (const unsigned char *)in,
+                inl - (inl % ctx->length), newlines, &wrap_cnt);
+        } else {
+            j = evp_encodeblock_int(ctx, out, in, inl - (inl % ctx->length),
+                &wrap_cnt);
         }
-        *out = '\0';
+#else
+        j = evp_encodeblock_int(ctx, out, in, inl - (inl % ctx->length),
+            &wrap_cnt);
+#endif
+    }
+    in += inl - (inl % ctx->length);
+    inl -= inl - (inl % ctx->length);
+    out += j;
+    total += j;
+    if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0 && ctx->length % 3 != 0) {
+        *(out++) = '\n';
+        total++;
     }
+    *out = '\0';
     if (total > INT_MAX) {
         /* Too much output data! */
         *outl = 0;
@@ -438,9 +452,11 @@ int EVP_EncodeUpdate(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl,
 void EVP_EncodeFinal(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl)
 {
     unsigned int ret = 0;
+    int wrap_cnt = 0;

     if (ctx->num != 0) {
-        ret = evp_encodeblock_int(ctx, out, ctx->enc_data, ctx->num);
+        ret = evp_encodeblock_int(ctx, out, ctx->enc_data, ctx->num,
+            &wrap_cnt);
         if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0)
             out[ret++] = '\n';
         out[ret] = '\0';
@@ -449,46 +465,20 @@ void EVP_EncodeFinal(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl)
     *outl = ret;
 }

-static int evp_encodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t,
-    const unsigned char *f, int dlen)
+int EVP_EncodeBlock(unsigned char *t, const unsigned char *f, int dlen)
 {
-    int i, ret = 0;
-    unsigned long l;
-    const unsigned char *table;
+    int wrap_cnt = 0; /* passed by address to the encoders; starts at 0 */
+
-    if (ctx != NULL && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0)
-        table = srpdata_bin2ascii;
+#if defined(__AVX2__) /* NOTE(review): enc_b64_avx2.h declares this encoder for x86-64 only */
+    return encode_base64_avx2(NULL, t, f, dlen, 0, &wrap_cnt);
+#elif defined(HAS_IA32CAP_IS_64) /* otherwise test the capability bit at run time */
+    if ((OPENSSL_ia32cap_P[2] & (1u << 5)) != 0)
+        return encode_base64_avx2(NULL, t, f, dlen, 0, &wrap_cnt);
     else
-        table = data_bin2ascii;
-
-    for (i = dlen; i > 0; i -= 3) {
-        if (i >= 3) {
-            l = (((unsigned long)f[0]) << 16L) | (((unsigned long)f[1]) << 8L) | f[2];
-            *(t++) = conv_bin2ascii(l >> 18L, table);
-            *(t++) = conv_bin2ascii(l >> 12L, table);
-            *(t++) = conv_bin2ascii(l >> 6L, table);
-            *(t++) = conv_bin2ascii(l, table);
-        } else {
-            l = ((unsigned long)f[0]) << 16L;
-            if (i == 2)
-                l |= ((unsigned long)f[1] << 8L);
-
-            *(t++) = conv_bin2ascii(l >> 18L, table);
-            *(t++) = conv_bin2ascii(l >> 12L, table);
-            *(t++) = (i == 1) ? '=' : conv_bin2ascii(l >> 6L, table);
-            *(t++) = '=';
-        }
-        ret += 4;
-        f += 3;
-    }
-
-    *t = '\0';
-    return ret;
-}
-
-int EVP_EncodeBlock(unsigned char *t, const unsigned char *f, int dlen)
-{
-    return evp_encodeblock_int(NULL, t, f, dlen);
+        return evp_encodeblock_int(NULL, t, f, dlen, &wrap_cnt);
+#else /* neither: scalar encoder only */
+    return evp_encodeblock_int(NULL, t, f, dlen, &wrap_cnt);
+#endif
 }

 void EVP_DecodeInit(EVP_ENCODE_CTX *ctx)
diff --git a/test/build.info b/test/build.info
index 6890544f21..c400ad0ef3 100644
--- a/test/build.info
+++ b/test/build.info
@@ -55,7 +55,7 @@ IF[{- !$disabled{tests} -}]
           ssl_test_ctx_test ssl_test x509aux cipherlist_test asynciotest \
           bio_callback_test bio_memleak_test bio_core_test bio_dgram_test param_build_test \
           bioprinttest sslapitest ssl_handshake_rtt_test dtlstest sslcorrupttest \
-          bio_base64_test bio_enc_test pkey_meth_test pkey_meth_kdf_test evp_kdf_test uitest \
+          bio_base64_test test_base64_simdutf bio_enc_test pkey_meth_test pkey_meth_kdf_test evp_kdf_test uitest \
           cipherbytes_test threadstest_fips threadpool_test \
           asn1_encode_test asn1_decode_test asn1_string_table_test asn1_stable_parse_test \
           x509_time_test x509_dup_cert_test x509_check_cert_pkey_test \
@@ -640,6 +640,10 @@ IF[{- !$disabled{tests} -}]
   INCLUDE[bio_base64_test]=../include ../apps/include
   DEPEND[bio_base64_test]=../libcrypto libtestutil.a

+  SOURCE[test_base64_simdutf] = test_base64_simdutf.c
+  INCLUDE[test_base64_simdutf] = ../include ../apps/include ../crypto/include ../crypto/evp/
+  DEPEND[test_base64_simdutf] = ../libcrypto libtestutil.a
+
   SOURCE[bio_enc_test]=bio_enc_test.c
   INCLUDE[bio_enc_test]=../include ../apps/include
   DEPEND[bio_enc_test]=../libcrypto libtestutil.a
diff --git a/test/recipes/90-test_base64_simdutf.t b/test/recipes/90-test_base64_simdutf.t
new file mode 100644
index 0000000000..7b3790be22
--- /dev/null
+++ b/test/recipes/90-test_base64_simdutf.t
@@ -0,0 +1,11 @@
+#! /usr/bin/env perl
+# Copyright 2025 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+use OpenSSL::Test::Simple;
+# NOTE(review): presumably (recipe name, test program, feature guard) - confirm
+simple_test("b64_simdutf", "test_base64_simdutf", "b64_simdutf");
\ No newline at end of file
diff --git a/test/test_base64_simdutf.c b/test/test_base64_simdutf.c
new file mode 100644
index 0000000000..a39155dc44
--- /dev/null
+++ b/test/test_base64_simdutf.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright 2025 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <stdio.h>
+#include "testutil.h"
+#include <openssl/evp.h>
+#include "internal/cryptlib.h"
+#include "crypto/evp.h"
+#include "evp_local.h"
+
+#define MAX_INPUT_LEN 3000
+
+/* Fill |ctx| with random buffered data and bookkeeping (test helper). */
+static void fuzz_fill_encode_ctx(EVP_ENCODE_CTX *ctx, int max_fill)
+{
+    static int seeded = 0;
+    int num, i;
+    if (ctx == NULL) /* tolerate a failed allocation in the caller */
+        return;
+    if (!seeded) {
+        srand((unsigned)time(NULL));
+        seeded = 1;
+    }
+    ctx->num = num = rand() % (max_fill + 1);
+    for (i = 0; i < num; i++)
+        ctx->enc_data[i] = (unsigned char)(rand() & 0xFF);
+    ctx->length = (rand() % 80) + 1;
+    ctx->line_num = rand() % (ctx->length + 1);
+}
+/* Advance the linear congruential generator in |*state|; return the new value. */
+static inline uint32_t next_u32(uint32_t *state)
+{
+    return *state = 1013904223u + 1664525u * *state;
+}
+
+static const unsigned char data_bin2ascii[65] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+/* SRP uses a different base64 alphabet */
+static const unsigned char srpdata_bin2ascii[65] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz./";
+
+#ifndef CHARSET_EBCDIC
+#define conv_bin2ascii(a, table) ((table)[(a) & 0x3f])
+#else /* CHARSET_EBCDIC: the macro is defined identically, see below */
+/*
+ * We assume that PEM encoded files are EBCDIC files (i.e., printable text
+ * files). Convert them here while decoding. When encoding, output is EBCDIC
+ * (text) format again. (No need for conversion in the conv_bin2ascii macro,
+ * as the underlying textstring data_bin2ascii[] is already EBCDIC)
+ */
+#define conv_bin2ascii(a, table) ((table)[(a) & 0x3f])
+#endif /* CHARSET_EBCDIC */
+
+/*
+ * Reference encoder (the previous scalar implementation), kept as the test
+ * oracle: writes the NUL-terminated base64 of |f| to |t|, returns its size.
+ */
+static int evp_encodeblock_int_old(EVP_ENCODE_CTX *ctx, unsigned char *t,
+    const unsigned char *f, int dlen)
+{
+    const int use_srp = ctx != NULL
+        && (ctx->flags & EVP_ENCODE_CTX_USE_SRP_ALPHABET) != 0;
+    const unsigned char *table = use_srp ? srpdata_bin2ascii : data_bin2ascii;
+    unsigned char *start = t;
+    unsigned long l;
+    int left;
+
+    for (left = dlen; left >= 3; left -= 3, f += 3) {
+        l = ((unsigned long)f[0] << 16L) | ((unsigned long)f[1] << 8L) | f[2];
+        *(t++) = conv_bin2ascii(l >> 18L, table);
+        *(t++) = conv_bin2ascii(l >> 12L, table);
+        *(t++) = conv_bin2ascii(l >> 6L, table);
+        *(t++) = conv_bin2ascii(l, table);
+    }
+
+    if (left > 0) {
+        /* 1 or 2 bytes remain: emit a final quantum padded with '='. */
+        l = (unsigned long)f[0] << 16L;
+        if (left == 2)
+            l |= (unsigned long)f[1] << 8L;
+        *(t++) = conv_bin2ascii(l >> 18L, table);
+        *(t++) = conv_bin2ascii(l >> 12L, table);
+        *(t++) = (left == 2) ? conv_bin2ascii(l >> 6L, table) : '=';
+        *(t++) = '=';
+    }
+
+    *t = '\0';
+    return (int)(t - start);
+}
+/* Reference copy of the previous EVP_EncodeUpdate(), kept as the test oracle. */
+static int evp_encodeupdate_old(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl,
+    const unsigned char *in, int inl)
+{
+    int nout = 0, n;
+
+    *outl = 0;
+    if (inl <= 0)
+        return 0;
+    OPENSSL_assert(ctx->length <= (int)sizeof(ctx->enc_data));
+
+    /* Still short of a full line: only buffer the input. */
+    if (inl < ctx->length - ctx->num) {
+        memcpy(ctx->enc_data + ctx->num, in, inl);
+        ctx->num += inl;
+        return 1;
+    }
+
+    /* Top up and flush a partially buffered line first. */
+    if (ctx->num != 0) {
+        n = ctx->length - ctx->num;
+        memcpy(ctx->enc_data + ctx->num, in, n);
+        in += n;
+        inl -= n;
+        nout = evp_encodeblock_int_old(ctx, out, ctx->enc_data, ctx->length);
+        if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0)
+            out[nout++] = '\n';
+        out[nout] = '\0';
+        out += nout;
+        ctx->num = 0;
+    }
+
+    /* Encode every complete line straight from the caller's buffer. */
+    for (; inl >= ctx->length && nout <= INT_MAX; inl -= ctx->length) {
+        n = evp_encodeblock_int_old(ctx, out, in, ctx->length);
+        if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0)
+            out[n++] = '\n';
+        out[n] = '\0';
+        in += ctx->length;
+        out += n;
+        nout += n;
+    }
+    if (nout > INT_MAX) {
+        /* Too much output data! */
+        *outl = 0;
+        return 0;
+    }
+    /* Save what does not fill a whole line for the next call. */
+    if (inl != 0)
+        memcpy(ctx->enc_data, in, inl);
+    ctx->num = inl;
+    *outl = nout;
+    return 1;
+}
+
+/* Reference copy of the previous EVP_EncodeFinal(): encodes buffered input. */
+static void evp_encodefinal_old(EVP_ENCODE_CTX *ctx, unsigned char *out, int *outl)
+{
+    unsigned char *end = out;
+    if (ctx->num != 0) {
+        end += evp_encodeblock_int_old(ctx, out, ctx->enc_data, ctx->num);
+        if ((ctx->flags & EVP_ENCODE_CTX_NO_NEWLINES) == 0)
+            *end++ = '\n';
+        *end = '\0';
+        ctx->num = 0;
+    }
+    *outl = (int)(end - out);
+}
+/* Compare EVP_EncodeUpdate()/EVP_EncodeFinal() with the reference encoder. */
+static int test_encode_line_lengths_reinforced(void)
+{
+    const int trials = 50;
+    uint32_t seed = 12345;
+    /* Generous output buffers (Update + Final + newlines), plus a guard byte */
+    unsigned char out_simd[9000 * 2 + 1] = { 0 };
+    unsigned char out_ref[9000 * 2 + 1] = { 0 };
+    unsigned char input[MAX_INPUT_LEN];
+    EVP_ENCODE_CTX *ctx_simd = NULL, *ctx_ref = NULL;
+    int t = 0, fill = 0, ctx_len = 0, srp, inl = 0, ok = 0;
+
+    for (t = 0; t < trials; t++) {
+        inl = next_u32(&seed) % MAX_INPUT_LEN;
+        /* Fresh input: draw a new value per byte, taking the LCG's high bits */
+        for (int i = 0; i < inl; i++)
+            input[i] = (unsigned char)(next_u32(&seed) >> 24);
+
+        for (fill = 0; fill <= 80; fill++) {
+            for (ctx_len = 1; ctx_len <= 80; ctx_len++) {
+                int outlen_simd = 0, outlen_ref = 0; /* bytes produced by Update */
+                int finlen_simd = 0, finlen_ref = 0; /* bytes produced by Final */
+
+                ctx_simd = EVP_ENCODE_CTX_new();
+                ctx_ref = EVP_ENCODE_CTX_new();
+                if (ctx_simd == NULL || ctx_ref == NULL) {
+                    TEST_error("Out of memory for contexts");
+                    goto end;
+                }
+                /* Scribble on the context only once it is known to be valid */
+                fuzz_fill_encode_ctx(ctx_simd, fill);
+                EVP_EncodeInit(ctx_simd);
+                EVP_EncodeInit(ctx_ref);
+                ctx_simd->length = ctx_len;
+                ctx_ref->length = ctx_len;
+
+                memset(out_simd, 0xCC, sizeof(out_simd)); /* poison to catch short writes */
+                memset(out_ref, 0xDD, sizeof(out_ref));
+
+                for (srp = 0; srp < 2; srp++) {
+                    int ret_simd, ret_ref;
+
+                    if (srp == 0) {
+                        /* Turn SRP alphabet OFF */
+                        ctx_simd->flags &= ~EVP_ENCODE_CTX_USE_SRP_ALPHABET;
+                        ctx_ref->flags &= ~EVP_ENCODE_CTX_USE_SRP_ALPHABET;
+                    } else {
+                        /* Turn SRP alphabet ON */
+                        ctx_simd->flags |= EVP_ENCODE_CTX_USE_SRP_ALPHABET;
+                        ctx_ref->flags |= EVP_ENCODE_CTX_USE_SRP_ALPHABET;
+                    }
+
+                    ret_simd = EVP_EncodeUpdate(ctx_simd, out_simd, &outlen_simd,
+                        input, inl);
+                    ret_ref = evp_encodeupdate_old(ctx_ref, out_ref, &outlen_ref,
+                        input, inl);
+                    if (!TEST_int_eq(ret_simd, ret_ref)
+                        || !TEST_int_eq(outlen_simd, outlen_ref)
+                        || !TEST_mem_eq(out_ref, outlen_ref, out_simd, outlen_simd))
+                        goto mismatch;
+
+                    EVP_EncodeFinal(ctx_simd, out_simd + outlen_simd,
+                        &finlen_simd);
+                    evp_encodefinal_old(ctx_ref, out_ref + outlen_ref,
+                        &finlen_ref);
+                    if (!TEST_int_eq(finlen_simd, finlen_ref)
+                        || !TEST_mem_eq(out_ref, outlen_ref + finlen_ref,
+                            out_simd, outlen_simd + finlen_simd))
+                        goto mismatch;
+                }
+
+                EVP_ENCODE_CTX_free(ctx_simd);
+                EVP_ENCODE_CTX_free(ctx_ref);
+                ctx_simd = ctx_ref = NULL;
+            }
+        }
+    }
+    ok = 1;
+    goto end;
+mismatch:
+    TEST_info("Trial %d, input length %d, ctx length %d, partial ctx fill %d",
+        t + 1, inl, ctx_len, fill);
+end:
+    EVP_ENCODE_CTX_free(ctx_simd);
+    EVP_ENCODE_CTX_free(ctx_ref);
+    return ok;
+}
+
+int setup_tests(void)
+{
+    ADD_TEST(test_encode_line_lengths_reinforced); /* the only case in this file */
+
+    return 1;
+}