Commit 2657697b6d for openssl.org

commit 2657697b6d29ea1d62161bdd96f4b06bb8e532b2
Author: Iakov Polyak <iakov.polyak@linaro.org>
Date:   Fri Sep 5 11:19:33 2025 +0100

    crypto/poly1305: Add SVE2 vector-length agnostic implementation.

    Implement Poly1305 using SVE2 VLA instructions for AArch64.

    This implementation is selected at runtime if SVE2 is present and the vector length is 256, 512, 1024 or 2048 bits.

    Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
    Reviewed-by: Paul Dale <paul.dale@oracle.com>
    (Merged from https://github.com/openssl/openssl/pull/28454)

diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
index 8dc06dd52a..f38a64bc6b 100755
--- a/crypto/arm64cpuid.pl
+++ b/crypto/arm64cpuid.pl
@@ -120,6 +120,14 @@ _armv8_sve2_probe:
 	ret
 .size	_armv8_sve2_probe,.-_armv8_sve2_probe

+.globl	_armv8_sve_get_vl_bytes
+.type	_armv8_sve_get_vl_bytes,%function
+_armv8_sve_get_vl_bytes:
+	AARCH64_VALID_CALL_TARGET
+	.inst	0x0420e3e0	// cntb x0
+	ret
+.size	_armv8_sve_get_vl_bytes,.-_armv8_sve_get_vl_bytes
+
 .globl	_armv8_cpuid_probe
 .type	_armv8_cpuid_probe,%function
 _armv8_cpuid_probe:
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index fc780a7080..b037e1b9f1 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -86,9 +86,10 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
 # define ARMV8_SHA3      (1<<11)
 # define ARMV8_UNROLL8_EOR3      (1<<12)
 # define ARMV8_SVE       (1<<13)
-# define ARMV8_SVE2      (1<<14)
+# define ARMV9_SVE2      (1<<14)
 # define ARMV8_HAVE_SHA3_AND_WORTH_USING     (1<<15)
 # define ARMV8_UNROLL12_EOR3     (1<<16)
+# define ARMV9_SVE2_POLY1305 (1<<17)

 /*
  * MIDR_EL1 system register
diff --git a/crypto/armcap.c b/crypto/armcap.c
index 7eeea93bd1..84f621aeb8 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -24,11 +24,18 @@
 #include <unistd.h>
 #endif
 #include "arm_arch.h"
+#ifdef __aarch64__
+#include <stdint.h>
+#endif

 unsigned int OPENSSL_armcap_P = 0;
 unsigned int OPENSSL_arm_midr = 0;
 unsigned int OPENSSL_armv8_rsa_neonized = 0;

+#ifdef __aarch64__
+uint64_t _armv8_sve_get_vl_bytes(void);
+#endif
+
 #ifdef _WIN32
 void OPENSSL_cpuid_setup(void)
 {
@@ -346,7 +353,7 @@ void OPENSSL_cpuid_setup(void)
             OPENSSL_armcap_P |= ARMV8_SVE;

         if (getauxval(OSSL_HWCAP2) & OSSL_HWCAP2_SVE2)
-            OPENSSL_armcap_P |= ARMV8_SVE2;
+            OPENSSL_armcap_P |= ARMV9_SVE2;

         if (getauxval(OSSL_HWCAP2) & OSSL_HWCAP2_RNG)
             OPENSSL_armcap_P |= ARMV8_RNG;
@@ -391,7 +398,7 @@ void OPENSSL_cpuid_setup(void)
     }
 #  ifdef __aarch64__
     OPENSSL_armcap_P |= arm_probe_for(_armv8_sve_probe, ARMV8_SVE);
-    OPENSSL_armcap_P |= arm_probe_for(_armv8_sve2_probe, ARMV8_SVE2);
+    OPENSSL_armcap_P |= arm_probe_for(_armv8_sve2_probe, ARMV9_SVE2);
     OPENSSL_armcap_P |= arm_probe_for(_armv8_rng_probe, ARMV8_RNG);
 #  endif

@@ -450,6 +457,17 @@ void OPENSSL_cpuid_setup(void)
          MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_QCOMM, QCOM_CPU_PART_ORYON_X1)) &&
         (OPENSSL_armcap_P & ARMV8_SHA3))
         OPENSSL_armcap_P |= ARMV8_HAVE_SHA3_AND_WORTH_USING;
+    if (OPENSSL_armcap_P & ARMV9_SVE2) {
+        uint64_t vl_bytes = _armv8_sve_get_vl_bytes();
+
+        if (vl_bytes > 16 && (vl_bytes & (vl_bytes - 1)) == 0) {
+            /*
+             * This implementation is faster when the vector length is
+             * greater than 128 bits, but the vector length must also be
+             * a power of 2 (e.g. 256 or 512 bits).
+             */
+            OPENSSL_armcap_P |= ARMV9_SVE2_POLY1305;
+        }
+    }
 # endif
 }
 #endif /* _WIN32, __ARM_MAX_ARCH__ >= 7 */
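
For reference, the gate above admits only power-of-two SVE vector lengths above
128 bits, i.e. 256, 512, 1024 or 2048 bits. A minimal stand-alone C sketch of
that check (the helper and driver names below are illustrative, not part of the
patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Same test as in OPENSSL_cpuid_setup(): power-of-two VL wider than 128 bits. */
    static int sve2_poly1305_usable(uint64_t vl_bytes)
    {
        return vl_bytes > 16 && (vl_bytes & (vl_bytes - 1)) == 0;
    }

    int main(void)
    {
        /* SVE permits any multiple of 128 bits up to 2048 bits. */
        for (uint64_t bits = 128; bits <= 2048; bits += 128)
            printf("VL %4u bits -> %s\n", (unsigned)bits,
                   sve2_poly1305_usable(bits / 8) ? "SVE2 Poly1305 path" : "default path");
        return 0;
    }
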
diff --git a/crypto/chacha/asm/chacha-armv8-sve.pl b/crypto/chacha/asm/chacha-armv8-sve.pl
index 62a8be6fe1..40454c3322 100755
--- a/crypto/chacha/asm/chacha-armv8-sve.pl
+++ b/crypto/chacha/asm/chacha-armv8-sve.pl
@@ -756,7 +756,7 @@ ChaCha20_ctr32_sve:
 	mov	$sve2flag,0
 	adrp	$tmp,OPENSSL_armcap_P
 	ldr	$tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
-	tst	$tmpw,#ARMV8_SVE2
+	tst	$tmpw,#ARMV9_SVE2
 	b.eq	1f
 	mov	$sve2flag,1
 	b	2f
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
index cc2052ecc9..6659cd631f 100755
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -69,6 +69,8 @@ $code.=<<___;
 .globl	poly1305_emit
 .hidden	poly1305_emit

+.extern poly1305_blocks_sve2
+
 .type	poly1305_init,%function
 .align	5
 poly1305_init:
@@ -109,6 +111,13 @@ poly1305_init:
 	csel	$d0,$d0,$r0,eq
 	csel	$d1,$d1,$r1,eq

+	tst	w17, #ARMV9_SVE2_POLY1305
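+	// If the SVE2 Poly1305 path is available, the csel below redirects the
+	//  "blocks" function pointer to poly1305_blocks_sve2.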
+
+	adrp	$r0,poly1305_blocks_sve2
+	add	$r0,$r0,#:lo12:poly1305_blocks_sve2
+
+	csel	$d0,$d0,$r0,eq
+
 #ifdef	__ILP32__
 	stp	w12,w13,[$len]
 #else
diff --git a/crypto/poly1305/asm/poly1305-armv9-sve2.pl b/crypto/poly1305/asm/poly1305-armv9-sve2.pl
new file mode 100755
index 0000000000..b68741fe58
--- /dev/null
+++ b/crypto/poly1305/asm/poly1305-armv9-sve2.pl
@@ -0,0 +1,1420 @@
+#! /usr/bin/env perl
+# Copyright 2016-2025 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+##############################################################################
+#
+# Copyright (c) 2025, Iakov Polyak <iakov.polyak@linaro.org>
+# This file is an SVE2 port-and-merge of the POLY1305 hash algorithm, derived from
+# the OpenSSL Neon implementation and a vector length agnostic (VLA)
+# RISC-V implementation from the CRYPTOGAMS project.
+#
+##############################################################################
+#
+# Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+#Redistribution and use in source and binary forms, with or without
+#modification, are permitted provided that the following conditions
+#are met:
+#
+#      *	Redistributions of source code must retain copyright notices,
+#	this list of conditions and the following disclaimer.
+#
+#      *	Redistributions in binary form must reproduce the above
+#	copyright notice, this list of conditions and the following
+#	disclaimer in the documentation and/or other materials
+#	provided with the distribution.
+#
+#      *	Neither the name of the CRYPTOGAMS nor the names of its
+#	copyright holder and contributors may be used to endorse or
+#	promote products derived from this software without specific
+#	prior written permission.
+#
+#ALTERNATIVELY, provided that this notice is retained in full, this
+#product may be distributed under the terms of the GNU General Public
+#License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+#those given above.
+#
+#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+##############################################################################
+#
+# September 2025
+#
+# This is a 100% vector length agnostic implementation and has
+# been tested with QEMU for the vector length of up to 2048 bits.
+#
+# On Graviton4, with the vector register length of 128 bits,
+# it is less efficient than the Neon implementation by only 6%.
+# This number has been obtained by running
+# `openssl speed -evp ChaCha20-POLY1305` and
+# `openssl speed -evp ChaCha20`, pinned to a single CPU,
+# converting the 8192-byte result to cycles per byte
+# using actual average runtime CPU frequency from `perf stat`,
+# and taking the difference. On Graviton 4, this results in
+# 0.62 cpb for Neon and 0.66 for SVE2.
+#
+# While Neon should probably be the default choice on a 128-bit architecture,
+# speed-up is clearly expected with 256-bit and larger vector registers
+# in the future.
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+
+my ($h0,$h1,$h2,$r0,$r1,$r2,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+my ($SVE_R0,$SVE_R1,$SVE_S1,$SVE_R2,$SVE_S2,$SVE_R3,$SVE_S3,$SVE_R4,$SVE_S4) = map("z$_.s",(0..8));
+my ($SVE_INlo_0,$SVE_INlo_1,$SVE_INlo_2,$SVE_INlo_3,$SVE_INlo_4) = map("z$_.s",(9..13));
+my ($SVE_INhi_0,$SVE_INhi_1,$SVE_INhi_2,$SVE_INhi_3,$SVE_INhi_4) = map("z$_.s",(14..18));
+my ($SVE_ACC0,$SVE_ACC1,$SVE_ACC2,$SVE_ACC3,$SVE_ACC4) = map("z$_.d",(19..23));
+my ($SVE_H0,$SVE_H1,$SVE_H2,$SVE_H3,$SVE_H4) = map("z$_.s",(24..28));
+my ($SVE_T0,$SVE_T1,$SVE_MASK) = map("z$_",(29..31));
+
+my ($vl,$vl0,$vl1,$vl2,$vl3,$vl4) = ("x16",$h0,$h1,$h2,$r0,$r1);
+my ($cs0,$cs1,$cs2,$cs3,$cs4,$cs5) = map("x$_",(19..24));
+my ($pwr,$mask) = map("x$_",(25..26));
+my $is_base2_26 = "w17";
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.arch armv8-a
+
+.extern poly1305_blocks
+
+// --- poly1305_sw_2_26 ---
+// Performs conversion of 3 base2_44 to 5 base2_26 scalars and
+//  stores them in memory at addresses [x5], [x5,#28], [x5,#56],
+//  [x5,#84] and [x5,#112].
+//
+// This is a leaf function and does not modify stack.
+//
+// Calling Convention:
+//   Inputs:
+//     x5: Pointer into memory where 1st value should be stored.
+//     x7-x9: The three base2_44 scalar values (r0-r2)
+//   Clobbers (uses as temporaries):
+//     x10-x15
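+//
+//   For reference, the bit split performed below is:
+//     [x5]       = r0[25:0]
+//     [x5,#28]   = r0[43:26] | r1[7:0]  << 18
+//     [x5,#56]   = r1[33:8]
+//     [x5,#84]   = r1[43:34] | r2[15:0] << 10
+//     [x5,#112]  = r2 >> 16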
+.type	poly1305_sw_2_26,%function
+.align	5
+poly1305_sw_2_26:
+	// Converts 3 base2_44 -> 5 base2_26 values and stores
+	mov		x15,#0x3ffffff			// w15  : 2^26-1 mask
+	and		x10,$r0,x15				// w10 -> r0
+	lsr		x11,$r0,#26				// w11 : top 18 bits of r0
+	str		w10,[x5]				// Store r0
+	bfi		x11,$r1,#18,#8			// w11 -> r1
+	ubfx	x12,$r1,#8,#26			// w12 -> r2
+	str		w11,[x5,#28]			// Store r1
+	lsr		x13,$r1,#34				// w13 : top 10 bits of r1
+	str		w12,[x5,#56]			// Store r2
+	bfi		x13,$r2,#10,#16			// w13 -> r3
+	lsr		x14,$r2,#16				// w14 -> r4
+	str		w13,[x5,#84]			// Store r3
+	str		w14,[x5,#112]			// Store r4
+	ret
+.size   poly1305_sw_2_26,.-poly1305_sw_2_26
+
+// --- poly1305_sqr_2_44 ---
+// Calculates base2_44 squaring operation.
+//
+// This is a leaf function and does not modify stack.
+// It however uses callee-saved registers as scratch, so those must be
+//  saved on stack prior to calling.
+//
+// Calling Convention:
+//   Inputs:
+//     x7-x9: The three base2_44 scalar values (r0-r2)
+//   Outputs:
+//     x7-x9: The three base2_44 scalar values, squared (r0-r2)
+//   Clobbers (uses as temporaries):
+//     x10-x15, x19-x24, x26
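+//
+//   Note: the *20 factors below come from reducing mod 2^130-5: limb products
+//   of weight 2^132 fold back as 2^132 == 4*5 == 20.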
+.type	poly1305_sqr_2_44,%function
+.align	5
+poly1305_sqr_2_44:
+
+    // Pre-calculate constants and doubled terms.
+	mov		x12,#20
+	lsl		x13,$r1,#1		// x13 = r1 * 2
+	mul		x12,$r2,x12		// x12 = r2 * 20
+	lsl		x10,$r0,#1		// x10 = r0 * 2
+
+    // --- Calculate d2 = r1*r1 + 2*r0*r2 ---
+	umulh	$cs5,$r1,$r1	// high part of r1*r1
+	mul		$cs4,$r1,$r1	// low part of r1*r1
+	umulh	x15,x10,$r2		// high part of (r0*2)*r2
+	mul		x14,x10,$r2		// low part of (r0*2)*r2
+
+    // --- Calculate d0 = r0*r0 + 20*(2*r1*r2) ---
+	umulh	$cs1,$r0,$r0	// high part of r0*r0
+	mul		$cs0,$r0,$r0	// low part of r0*r0
+	umulh	x11,x13,x12		// high part of (r1*2)*(r2*20)
+	mul		x10,x13,x12		// low part of (r1*2)*(r2*20)
+
+	adds	$cs4,$cs4,x14	// d2_lo
+	adc		$cs5,$cs5,x15	// d2_hi
+
+    // --- Calculate d1 = 2*r0*r1 + 20*r2*r2 ---
+    // d1 is a 128-bit result stored in $cs3:$cs2 (hi:lo)
+	umulh	$cs3,$r0,x13	// high part of r0*(r1*2)
+	mul		$cs2,$r0,x13	// low part of r0*(r1*2)
+	umulh	x13,$r2,x12		// high part of r2*(r2*20)
+	mul		x12,$r2,x12		// low part of r2*(r2*20)
+
+	adds	$cs0,$cs0,x10	// d0_lo
+	adc		$cs1,$cs1,x11	// d0_hi
+
+	adds	$cs2,$cs2,x12	// d1_lo
+	adc		$cs3,$cs3,x13	// d1_hi
+
+    // --- Reduction and Carry Propagation ---
+    // Reduce the 128-bit d0, d1, d2 back to three 44-bit limbs in x0, x1, x2
+	lsr		x10,$cs0,#44	// (d0_lo >> 44)
+	lsl		x11,$cs1,#20	// (d0_hi << 20) - high 20 bits are zero
+	and		$r0,$cs0,$mask	// r0 -> d0_lo & mask
+	orr		x10,x10,x11		// x10 -> 64-bit carry from d0
+
+	lsr		x12,$cs2,#44	// (d1_lo >> 44)
+	lsl		x13,$cs3,#20	// (d1_hi << 20)
+	and		$r1,$cs2,$mask	// r1 -> d1_lo & mask
+	orr		x12,x12,x13		// x12 -> 64-bit carry from d1
+	add		$r1,$r1,x10		// r1 += carry from d0
+
+	lsr		x11,$mask,#2	// x11 -> 2^42-1 mask for d2 reduction
+	lsr		x10,$cs4,#42	// (d2_lo >> 42)
+	lsl		x13,$cs5,#22	// (d2_hi << 22)
+	and		$r2,$cs4,x11	// r2 -> d2_lo & 2^42-1 mask
+	orr		x10,x10,x13		// x10 -> final carry from d2
+	add		$r2,$r2,x12		// r2 += carry from d1
+
+    // Handle ripple-carry from r2 and apply the *5 reduction.
+	lsr		x13,$r2,#42		// Get carry from r2 (if r2 >= 2^42)
+	and		$r2,$r2,x11		// Mask r2 back down to 42 bits
+	add		x10,x10,x13		// Add this ripple-carry to the final carry
+
+	add		x11,x10,x10,lsl #2	// x11 -> final_carry * 5
+	add		$r0,$r0,x11			// r0 += final_carry * 5
+
+    // Final ripple-carry chain to ensure all limbs are 44 bits.
+	lsr		x11,$r1,#44		// Get carry from r1
+	and		$r1,$r1,$mask	// Mask r1 to 44 bits
+	add		$r2,$r2,x11		// r2 += carry from r1
+
+	lsr		x10,$r0,#44		// Get carry from r0
+	and		$r0,$r0,$mask	// Mask r0 to 44 bits
+	add		$r1,$r1,x10		// r1 += carry from r0
+
+    ret
+.size	poly1305_sqr_2_44,.-poly1305_sqr_2_44
+
+// --- poly1305_lazy_reduce_sve2 ---
+// Performs lazy reduction on five accumulator vectors as discussed
+// in "NEON crypto" by D.J. Bernstein and P. Schwabe.
+//
+// This is a leaf function and does not modify GPRs or the stack.
+//
+// Calling Convention:
+//   Inputs:
+//     z19-z23: The five 64-bit .d accumulator vectors (ACC0-ACC4)
+//   Outputs:
+//     z24-z28: The five 32-bit .s final limb vectors (H0-H4)
+//     z31: All-zeros (resets mask)
+//   Clobbers (uses as temporaries):
+//     z29, z30
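+//
+//   Note: the carry out of h4 is folded into h0 times 5 (an add plus an add
+//   of the carry shifted left by 2), since 2^130 == 5 (mod 2^130-5).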
+
+.type	poly1305_lazy_reduce_sve2,%function
+.align	5
+poly1305_lazy_reduce_sve2:
+	dup 	${SVE_MASK}.d,#-1
+	lsr 	${SVE_T0}.d,$SVE_ACC3,#26
+	trn1	$SVE_H3,z22.s,z24.s					// reproducing Neon's `xtn` - treat ACC3 as a .s vector
+	lsr 	${SVE_MASK}.d,${SVE_MASK}.d,#38
+	lsr 	${SVE_T1}.d,$SVE_ACC0,#26
+	and 	$SVE_ACC0,$SVE_ACC0,${SVE_MASK}.d
+	add 	$SVE_ACC4,$SVE_ACC4,${SVE_T0}.d	    // h3 -> h4
+	// Neon's bic is replaced with &=$SVE_MASK (because of using even-indexed elements)
+	and 	z27.d,z27.d,${SVE_MASK}.d			// refer to SVE_H3 as .d
+	add 	$SVE_ACC1,$SVE_ACC1,${SVE_T1}.d	    // h0 -> h1
+
+	lsr 	${SVE_T0}.d,$SVE_ACC4,#26
+	trn1	$SVE_H4,z23.s,z24.s					// reproducing Neon's `xtn` - treat ACC4 as a .s vector
+	lsr 	${SVE_T1}.d,$SVE_ACC1,#26
+	trn1	$SVE_H1,z20.s,z24.s					// reproducing Neon's `xtn` - treat ACC1 as a .s vector
+	and 	z28.d,z28.d,${SVE_MASK}.d			// refer to SVE_H4 as .d
+	add 	$SVE_ACC2,$SVE_ACC2,${SVE_T1}.d	    // h1 -> h2
+
+	add 	$SVE_ACC0,$SVE_ACC0,${SVE_T0}.d
+	lsl 	${SVE_T0}.d,${SVE_T0}.d,#2
+	shrnb	${SVE_T1}.s,$SVE_ACC2,#26			// check it's OK
+	trn1	$SVE_H2,z21.s,z24.s					// reproducing Neon's `xtn` - treat ACC2 as a .s vector
+	add 	$SVE_ACC0,$SVE_ACC0,${SVE_T0}.d		// h4 -> h0
+	and 	z25.d,z25.d,${SVE_MASK}.d			// refer to SVE_H1 as .d
+	add 	$SVE_H3,$SVE_H3,${SVE_T1}.s			// h2 -> h3
+	and 	z26.d,z26.d,${SVE_MASK}.d			// refer to SVE_H2 as .d
+
+	shrnb	${SVE_T0}.s,$SVE_ACC0,#26
+	trn1	$SVE_H0,z19.s,z24.s					// reproducing Neon's `xtn` - treat ACC0 as a .s vector - re-writing H0 here...
+	lsr 	${SVE_T1}.s,$SVE_H3,#26
+	and 	z27.d,z27.d,${SVE_MASK}.d			// refer to SVE_H3 as .d
+	add 	$SVE_H1,$SVE_H1,${SVE_T0}.s			// h0 -> h1
+	and 	z24.d,z24.d,${SVE_MASK}.d			// refer to SVE_H0 as .d
+	add 	$SVE_H4,$SVE_H4,${SVE_T1}.s			// h3 -> h4
+
+	eor 	${SVE_MASK}.d,${SVE_MASK}.d,${SVE_MASK}.d	// reset zero mask
+
+    ret
+.size	poly1305_lazy_reduce_sve2,.-poly1305_lazy_reduce_sve2
+
+// --- poly1305_blocks_sve2 ---
+// Main function, implementing POLY1305 algorithm as discussed
+// in "NEON crypto" by D.J. Bernstein and P. Schwabe, in a VLA fashion,
+// using SVE2.
+//
+// It is mostly a port-and-merge of the 128-bit Neon implementation herein and
+//  a VLA risc-v implementation in https://github.com/dot-asm/cryptogams.
+//
+.globl	poly1305_blocks_sve2
+.type	poly1305_blocks_sve2,%function
+.align	5
+poly1305_blocks_sve2:
+.Lpoly1305_blocks_sve2:
+	AARCH64_VALID_CALL_TARGET
+	ldr	$is_base2_26,[$ctx,#24]
+	// Estimate vector width and branch to scalar if input too short
+	cntd	$vl					// vector width in 64-bit lanes (vl)
+	lsl	$vl0,$vl,#4				// vl * 16 (bytes per vector input blocks)
+	add $vl1,$vl0,$vl0,lsl #1	// 3 * vl * 16 - new threshold.
+	cmp	$len,$vl1
+	b.hs	.Lblocks_sve2
+	cbz	$is_base2_26,.Lshort_blocks	// Short input: use the scalar function unless already in base 2^26
+
+.Lblocks_sve2:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-144]!		// Allowing for callee-saved reg-s
+	add	x29,sp,#0
+
+	//Store some callee-saved GPRs
+	stp	x19,x20,[sp,#16]
+ 	stp	x21,x22,[sp,#32]
+ 	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+
+	ands	$len,$len,#-16
+	b.eq	.Lno_data_sve2
+
+	cbz	$is_base2_26,.Lbase2_64_sve2
+
+	ldp	w10,w11,[$ctx]			// load hash value base 2^26
+	ldp	w12,w13,[$ctx,#8]
+	ldr	w14,[$ctx,#16]
+
+	neg	$vl1,$vl0				// - (vl * 16)
+	sub	$vl0,$vl0,#1			// (vl * 16) - 1
+	and	$vl2,$len,$vl1			// $len - ($len % (vl * 16)) -> VLA length
+	and	$vl4,$len,$vl0			// $len % (vl * 16) -> scalar remainder
+	cbz	$vl4,.Leven_sve2		// If no scalar "head", proceed to VLA
+	add	$vl3,$inp,$vl4			// Pointer to the start of the VLA data
+	stp	$vl2,$vl3,[sp,#-16]!	// Backup VLA length and ptr
+	mov	$len,$vl4				// So that the scalar part knows its length
+
+	add	$h0,x10,x11,lsl#26		// base 2^26 -> base 2^64
+	lsr	$h1,x12,#12
+	adds	$h0,$h0,x12,lsl#52
+	add	$h1,$h1,x13,lsl#14
+	adc	$h1,$h1,xzr
+	lsr	$h2,x14,#24
+	adds	$h1,$h1,x14,lsl#40
+	adc	$d2,$h2,xzr				// can be partially reduced...
+
+	and	$t0,$d2,#-4				// ... so reduce
+	and	$h2,$d2,#3
+	add	$t0,$t0,$d2,lsr#2
+	adds	$h0,$h0,$t0
+	adcs	$h1,$h1,xzr
+	adc	$h2,$h2,xzr
+
+	stp	$h0,$h1,[$ctx]			// store hash value base 2^64
+	str	$h2,[$ctx,#16]
+
+	bl	poly1305_blocks			// Calculate the scalar "head"
+	ldp	$len,$inp,[sp],#16		// Recover updated length and input ptr
+	ldr	x30,[sp,#8]
+
+	cbz	$padbit,.Lzero_padbit_sve2	// hash already stored in poly1305_blocks
+
+	ldp	$h0,$h1,[$ctx]			// load hash value base 2^64
+	ldr $h2,[$ctx,#16]
+
+	and	x10,$h0,#0x03ffffff		// base 2^64 -> base 2^26
+	ubfx	x11,$h0,#26,#26
+	extr	x12,$h1,$h0,#52
+	and	x12,x12,#0x03ffffff
+	ubfx	x13,$h1,#14,#26
+	extr	x14,$h2,$h1,#40
+
+	cbnz	$len,.Leven_sve2
+
+	stp	w10,w11,[$ctx]			// store hash value base 2^26
+	stp	w12,w13,[$ctx,#8]
+	str	w14,[$ctx,#16]
+	b	.Lno_data_sve2
+
+.align	4
+.Lzero_padbit_sve2:
+	str	xzr,[$ctx,#24]
+	b	.Lno_data_sve2
+
+.align	4
+.Lbase2_64_sve2:
+	neg	$vl1,$vl0				// - (vl * 16)
+	sub	$vl0,$vl0,#1			// (vl * 16) - 1
+	and	$vl2,$len,$vl1			// $len - ($len % (vl * 16)) -> VLA length
+	and	$vl4,$len,$vl0			// $len % (vl * 16) -> scalar remainder
+	cbz	$vl4,.Linit_sve2		// If no scalar "head", proceed to VLA
+	add	$vl3,$inp,$vl4			// Pointer to the start of the VLA data
+	stp	$vl2,$vl3,[sp,#-16]!	// Backup VLA length and ptr
+	mov	$len,$vl4				// So that the scalar part knows its length
+	bl	poly1305_blocks			// Calculate the scalar "head"
+	ldp	$len,$inp,[sp],#16		// Recover updated length and input ptr
+
+.Linit_sve2:
+	// Calculating and storing r-powers (powers of a key).
+	// The layout of how r-powers are stored in memory:
+	//////////////////////////////////////////////////////////////////////////////////////
+	//                   lobe 1                           lobe 2                   etc. //
+	//      | .. r^{max},r^{max/2},...,r^2,r | .. r^{max},r^{max/2},...,r^2,r | ..      //
+	//     / \                              / \                              / \        //
+	//  [$ctx,48]                       [$ctx,48+28]                     [$ctx,48+56]   //
+	//////////////////////////////////////////////////////////////////////////////////////
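+	// For example, with vl==4 (256-bit vectors) each lobe ends with the four
+	//  words {r^8, r^4, r^2, r}, and [$ctx,#28] holds the offset -16 to r^8.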
+
+	ldr w5,[$ctx,#28]		// Load the top-power offset (0 until the powers have been computed)
+	add $pwr,$ctx,#48+28	// Point to the end of powers allocation (1st lobe)
+
+	mov $mask,#-1
+	lsr $mask,$mask,#20		//2^44-1
+
+	cbnz	w5,.Lpwrs_precomputed
+
+	ldp	$r0,$r1,[$ctx,#32]	// load key value
+
+	lsr	$r2,$r1,#24			// base2_64 -> base2_44
+	extr	$r1,$r1,$r0,#44
+	and	$r0,$r0,$mask
+	and	$r1,$r1,$mask
+
+	mov	x4,$vl
+	add	x5,$pwr,#-4
+	bl	poly1305_sw_2_26
+
+.Loop_pwrs_sqr:
+	lsr	x4,x4,#1
+	add	x5,x5,#-4
+	bl	poly1305_sqr_2_44
+	bl	poly1305_sw_2_26
+	cbnz	 x4,.Loop_pwrs_sqr
+
+	sub	x5,x5,$pwr
+	str	w5,[$ctx,#28]
+
+.Lpwrs_precomputed:
+	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
+	ldr $h2,[$ctx,#16]
+
+	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
+	ubfx	x11,$h0,#26,#26
+	extr	x12,$h1,$h0,#52
+	and	x12,x12,#0x03ffffff
+	ubfx	x13,$h1,#14,#26
+	extr	x14,$h2,$h1,#40
+
+	stp	d8,d9,[sp,#80]		// meet ABI requirements
+	stp	d10,d11,[sp,#96]
+	stp	d12,d13,[sp,#112]
+	stp	d14,d15,[sp,#128]
+
+    // Zeroing H0-H4 registers
+	eor 	z24.d,z24.d,z24.d  // H0
+	eor 	z25.d,z25.d,z25.d  // H1
+	eor 	z26.d,z26.d,z26.d  // H2
+	eor 	z27.d,z27.d,z27.d  // H3
+	eor 	z28.d,z28.d,z28.d  // H4
+
+	// Using Neon's fmov here for speed.
+	//  We only need the low 26 bits in the first step so no need for post-mov reshuffle.
+	fmov	d24,x10		// H0
+	fmov	d25,x11		// H1
+	fmov	d26,x12		// H2
+	fmov	d27,x13		// H3
+	fmov	d28,x14		// H4
+
+	ldr	x30,[sp,#8]
+
+	mov	x4,#1
+	stur	w4,[$ctx,#24]		// set is_base2_26
+	b	.Ldo_sve2
+
+.align	4
+.Leven_sve2:
+	// In principle all this could be moved to Ldo_sve2
+	stp	d8,d9,[sp,#80]		// meet ABI requirements
+	stp	d10,d11,[sp,#96]
+	stp	d12,d13,[sp,#112]
+	stp	d14,d15,[sp,#128]
+
+	eor 	z24.d,z24.d,z24.d  // H0
+	eor 	z25.d,z25.d,z25.d  // H1
+	eor 	z26.d,z26.d,z26.d  // H2
+	eor 	z27.d,z27.d,z27.d  // H3
+	eor 	z28.d,z28.d,z28.d  // H4
+
+	fmov	d24,x10		// H0
+	fmov	d25,x11		// H1
+	fmov	d26,x12		// H2
+	fmov	d27,x13		// H3
+	fmov	d28,x14		// H4
+
+.Ldo_sve2:
+    ptrue   p0.b, ALL               		// Set all-true predicate
+
+	// Load r-powers.
+	// They are stored in five lobes, in the order r^{max},...,r^2,r^1 each.
+	// We need specific powers to be at specific R- and S-vector indices.
+	// Hence we can't simply load them all contiguously - their number is arbitrary and depends on VL.
+	// Instead we load {r^{max},r^{max/2}} and {r^2,r^1} in batches,
+	//  and then interleave them using zip1 as {r^{max},r^2,r^{max/2},r}.
+	// We don't really care where r^{max} and r^{max/2} are, but we want
+	//  r^2 and r to be in either even or odd lanes. We chose lanes 1 and 3.
+	// Intermediate r-powers (r^{max/4},..,r^4), if applicable, will be
+	//  reloaded into lane 0 iteratively in Loop_reduce_sve2.
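+	// For example, with vl==4 (256-bit vectors) the low four word lanes of each
+	//  R-vector become {r^8, r^2, r^4, r}; the q-word dup below then fills the rest.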
+
+	ldr 	w5,[$ctx,#28]
+	sxtw	x5,w5				// Sign-extend the (negative) offset
+	add 	$pwr,$ctx,#48+28	// Pointer to the end of the r-powers 1st lobe
+	add		x10,$ctx,#48+20		// Pointer to r^2.
+	add		$pwr,$pwr,x5		// Pointer to the r^{max}
+
+	mov		x15,#2
+	whilelo	p1.s,xzr,x15
+
+	// If we didn't need to load in two chunks, we could use ld1rqw -
+	//  an optimisation opportunity for the 256-bit vector case.
+	ld1w	{ $SVE_R0 },p1/z,[$pwr]
+	ld1w	{ $SVE_T0.s },p1/z,[x10]
+	add		$pwr,$pwr,#28
+	add		x10,x10,#28
+	zip1	$SVE_R0,$SVE_R0,$SVE_T0.s
+
+	ld1w	{ $SVE_R1 },p1/z,[$pwr]
+	ld1w	{ $SVE_T1.s },p1/z,[x10]
+	add		$pwr,$pwr,#28
+	add		x10,x10,#28
+	zip1	$SVE_R1,$SVE_R1,$SVE_T1.s
+
+	ld1w	{ $SVE_R2 },p1/z,[$pwr]
+	ld1w	{ $SVE_T0.s },p1/z,[x10]
+	add		$pwr,$pwr,#28
+	add		x10,x10,#28
+	zip1	$SVE_R2,$SVE_R2,$SVE_T0.s
+
+	ld1w	{ $SVE_R3 },p1/z,[$pwr]
+	ld1w	{ $SVE_T1.s },p1/z,[x10]
+	add		$pwr,$pwr,#28
+	add		x10,x10,#28
+	zip1	$SVE_R3,$SVE_R3,$SVE_T1.s
+
+	ld1w	{ $SVE_R4 },p1/z,[$pwr]
+	ld1w	{ $SVE_T0.s },p1/z,[x10]
+	sub		$pwr,$pwr,#104				// Adjust to 1st lobe, 3rd power
+	zip1	$SVE_R4,$SVE_R4,$SVE_T0.s
+
+	// Broadcast r-powers loaded above to higher parts of the R-vectors.
+	cmp		$vl,#2
+	b.eq	.L_skip_dup_broadcast
+	dup		z0.q,z0.q[0]
+	dup		z1.q,z1.q[0]
+	dup		z3.q,z3.q[0]
+	dup		z5.q,z5.q[0]
+	dup		z7.q,z7.q[0]
+
+.L_skip_dup_broadcast:
+	// Calculate S-vectors (r^x*5)
+	adr     $SVE_S1,[$SVE_R1,$SVE_R1,lsl #2]
+	adr     $SVE_S2,[$SVE_R2,$SVE_R2,lsl #2]
+	adr     $SVE_S3,[$SVE_R3,$SVE_R3,lsl #2]
+	adr     $SVE_S4,[$SVE_R4,$SVE_R4,lsl #2]
+
+	// Load initial input blocks
+	lsr		x15,$len,#4
+	whilelo	p1.s,xzr,x15					// Set predicate for blocks loading
+	lsl	$padbit,$padbit,#24
+	ld4w	{ z9.s-z12.s },p1/z,[$inp]		// Loading all blocks at once
+
+#ifdef  __AARCH64EB__
+	revb	z9.s,  p0/m, z9.s
+	revb	z10.s, p0/m, z10.s
+	revb	z11.s, p0/m, z11.s
+	revb	z12.s, p0/m, z12.s
+#endif
+
+	// In-vector (VLA) conversion base2_64 -> base2_26.
+	dup 	${SVE_MASK}.s,#-1
+	lsr 	${SVE_MASK}.s,${SVE_MASK}.s,#6
+
+	lsr		${SVE_T0}.s,z11.s,#14		// T0 -> z11 >> 14
+	lsr		z13.s,z12.s,#8				// z13 -> l4
+	lsl		z11.s,z11.s,#12				// z11 -> upper part of l2
+	lsl		z12.s,z12.s,#18				// z12 -> upper part of l3
+	lsr		${SVE_T1}.s,z10.s,#20		// T1 -> z10 >> 20
+	orr		z12.d,z12.d,${SVE_T0}.d		// z12 -> final l3
+	lsl		z10.s,z10.s,#6				// z10 -> upper part of l1
+	lsr		${SVE_T0}.s,z9.s,#26		// T0 -> z9 >> 26
+	and		z9.d,z9.d,${SVE_MASK}.d		// z9 is now final l0
+	orr		z11.d,z11.d,${SVE_T1}.d		// z11 -> final l2
+	orr		z10.d,z10.d,${SVE_T0}.d		// z10 -> final l1
+	dup		${SVE_T1}.s,w3				// x3 -> $padbit but need it as a word
+	eor 	${SVE_T0}.d,${SVE_T0}.d,${SVE_T0}.d	// set zero mask
+	orr		z13.d,z13.d,${SVE_T1}.d		// l4 += padbit
+	and		z12.d,z12.d,${SVE_MASK}.d	// Mask l3
+	and		z11.d,z11.d,${SVE_MASK}.d	// Mask l2
+	and		z10.d,z10.d,${SVE_MASK}.d	// Mask l1
+
+
+	// Move high blocks from INlo -> INhi and sparsify (put in even lanes)
+	zip2	z14.s,z9.s,${SVE_T0}.s
+	zip2	z18.s,z13.s,${SVE_T0}.s
+	zip2	z17.s,z12.s,${SVE_T0}.s
+	zip2	z16.s,z11.s,${SVE_T0}.s
+	zip2	z15.s,z10.s,${SVE_T0}.s
+
+	// Sparsify blocks to even lanes in INlo
+	zip1	z9.s,z9.s,${SVE_T0}.s
+	zip1	z13.s,z13.s,${SVE_T0}.s
+	zip1	z12.s,z12.s,${SVE_T0}.s
+	zip1	z11.s,z11.s,${SVE_T0}.s
+	zip1	z10.s,z10.s,${SVE_T0}.s
+
+	subs	$len,$len,$vl,lsl #5		// By half vector width * 32
+
+	b.ls	.Lskip_loop_sve2
+
+.align	4
+.Loop_sve2:
+	///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// ((inp[0]*r^{vl*2} + inp[vl]  *r^{vl} + inp[2*vl]  )*r^{vl} + inp[3*vl]  )*r^{vl}
+	//+((inp[1]*r^{vl*2} + inp[vl+1]*r^{vl} + inp[2*vl+1])*r^{vl} + inp[3*vl+1])*r^{vl-1}
+	//+...
+	//   \_______________________________/    \_________________________________________/
+	//      first main loop iteration                       long tail
+	//
+	// ((inp[0]*r^{vl*2} + inp[vl]  *r^{vl} + inp[2*vl]  )*r^{vl*2} + inp[3*vl]  *r^{vl} + inp[4*vl]  )*r^{vl}
+	//+((inp[1]*r^{vl*2} + inp[vl+1]*r^{vl} + inp[2*vl+1])*r^{vl*2} + inp[3*vl+1]*r^{vl} + inp[4*vl+1])*r^{vl-1}
+	//+...
+	//   \_______________________________/    \________________________________________/   \___________________/
+	//      first main loop iteration             second main loop iteration                    short tail
+	//
+	// Note that we start with inp[vl:vl*2]*r^{vl}, as it
+	// doesn't depend on reduction in previous iteration.
+	///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Hash-key power product formula for the 5 limbs in base2^26 representation:
+	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
+	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
+	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
+	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
+	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
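+	// The 5*r_j terms fold products whose combined limb weight reaches 2^130,
+	//  again using 2^130 == 5 (mod 2^130-5).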
+
+	add		$inp,$inp,$vl,lsl #5
+
+	umullb	$SVE_ACC4,$SVE_INhi_0,${SVE_R4}[2]
+	umullb	$SVE_ACC3,$SVE_INhi_0,${SVE_R3}[2]
+	umullb	$SVE_ACC2,$SVE_INhi_0,${SVE_R2}[2]
+	umullb	$SVE_ACC1,$SVE_INhi_0,${SVE_R1}[2]
+	umullb	$SVE_ACC0,$SVE_INhi_0,${SVE_R0}[2]
+
+	umlalb	$SVE_ACC4,$SVE_INhi_1,${SVE_R3}[2]
+	umlalb	$SVE_ACC3,$SVE_INhi_1,${SVE_R2}[2]
+	umlalb	$SVE_ACC2,$SVE_INhi_1,${SVE_R1}[2]
+	umlalb	$SVE_ACC1,$SVE_INhi_1,${SVE_R0}[2]
+	umlalb	$SVE_ACC0,$SVE_INhi_1,${SVE_S4}[2]
+
+	umlalb	$SVE_ACC4,$SVE_INhi_2,${SVE_R2}[2]
+	umlalb	$SVE_ACC3,$SVE_INhi_2,${SVE_R1}[2]
+	umlalb	$SVE_ACC2,$SVE_INhi_2,${SVE_R0}[2]
+	umlalb	$SVE_ACC1,$SVE_INhi_2,${SVE_S4}[2]
+	umlalb	$SVE_ACC0,$SVE_INhi_2,${SVE_S3}[2]
+
+	umlalb	$SVE_ACC4,$SVE_INhi_3,${SVE_R1}[2]
+	umlalb	$SVE_ACC3,$SVE_INhi_3,${SVE_R0}[2]
+	umlalb	$SVE_ACC2,$SVE_INhi_3,${SVE_S4}[2]
+	umlalb	$SVE_ACC1,$SVE_INhi_3,${SVE_S3}[2]
+	umlalb	$SVE_ACC0,$SVE_INhi_3,${SVE_S2}[2]
+
+	add 	$SVE_INlo_2,$SVE_INlo_2,$SVE_H2
+	umlalb	$SVE_ACC4,$SVE_INhi_4,${SVE_R0}[2]
+	umlalb	$SVE_ACC3,$SVE_INhi_4,${SVE_S4}[2]
+	umlalb	$SVE_ACC2,$SVE_INhi_4,${SVE_S3}[2]
+	umlalb	$SVE_ACC1,$SVE_INhi_4,${SVE_S2}[2]
+	umlalb	$SVE_ACC0,$SVE_INhi_4,${SVE_S1}[2]
+
+	//////////////////////////////////////////////////////////////////////
+	// (hash+inp[0:vl])*r^{vl*2} and accumulate
+	// Interleave add+mul with loading and converting the next input batch
+
+	add 	$SVE_INlo_0,$SVE_INlo_0,$SVE_H0
+	 lsr	x15,$len,#4
+	umlalb	$SVE_ACC3,$SVE_INlo_2,${SVE_R1}[0]
+	 whilelo	p1.s,xzr,x15
+	umlalb	$SVE_ACC0,$SVE_INlo_2,${SVE_S3}[0]
+	 ld4w	{ z14.s-z17.s }, p1/z, [$inp]
+	umlalb	$SVE_ACC4,$SVE_INlo_2,${SVE_R2}[0]
+	umlalb	$SVE_ACC1,$SVE_INlo_2,${SVE_S4}[0]
+	umlalb	$SVE_ACC2,$SVE_INlo_2,${SVE_R0}[0]
+
+#ifdef  __AARCH64EB__
+	revb	z14.s, p0/m, z14.s
+	revb	z15.s, p0/m, z15.s
+	revb	z16.s, p0/m, z16.s
+	revb	z17.s, p0/m, z17.s
+#endif
+
+	add 	$SVE_INlo_1,$SVE_INlo_1,$SVE_H1
+	 dup 	${SVE_MASK}.s,#-1
+	umlalb	$SVE_ACC3,$SVE_INlo_0,${SVE_R3}[0]
+	 lsr 	${SVE_MASK}.s,${SVE_MASK}.s,#6
+	umlalb	$SVE_ACC4,$SVE_INlo_0,${SVE_R4}[0]
+	 lsr	${SVE_T0}.s,z16.s,#14		// T0 -> z16 >> 14
+	umlalb	$SVE_ACC2,$SVE_INlo_0,${SVE_R2}[0]
+	 lsr	z18.s,z17.s,#8				// z18 -> l4
+	umlalb	$SVE_ACC0,$SVE_INlo_0,${SVE_R0}[0]
+	 lsl	z16.s,z16.s,#12				// z16 -> upper part of l2
+	umlalb	$SVE_ACC1,$SVE_INlo_0,${SVE_R1}[0]
+	 lsl	z17.s,z17.s,#18				// z17 -> upper part of l3
+
+	add 	$SVE_INlo_3,$SVE_INlo_3,$SVE_H3
+	 lsr	${SVE_T1}.s,z15.s,#20		// T1 -> z15 >> 20
+	umlalb	$SVE_ACC3,$SVE_INlo_1,${SVE_R2}[0]
+	 orr	z17.d,z17.d,${SVE_T0}.d		// z17 -> final l3
+	umlalb	$SVE_ACC4,$SVE_INlo_1,${SVE_R3}[0]
+	 lsl	z15.s,z15.s,#6				// z15 -> upper part of l1
+	umlalb	$SVE_ACC0,$SVE_INlo_1,${SVE_S4}[0]
+	 lsr	${SVE_T0}.s,z14.s,#26		// T0 -> z14 >> 26
+	umlalb	$SVE_ACC2,$SVE_INlo_1,${SVE_R1}[0]
+	 and	z14.d,z14.d,${SVE_MASK}.d	// z14 is now final l0
+	umlalb	$SVE_ACC1,$SVE_INlo_1,${SVE_R0}[0]
+	 orr	z16.d,z16.d,${SVE_T1}.d		// z16 -> final l2
+
+	add 	$SVE_INlo_4,$SVE_INlo_4,$SVE_H4
+	 orr	z15.d,z15.d,${SVE_T0}.d		// z15 -> final l1
+	umlalb	$SVE_ACC3,$SVE_INlo_3,${SVE_R0}[0]
+	 dup	${SVE_T1}.s,w3
+	umlalb	$SVE_ACC0,$SVE_INlo_3,${SVE_S2}[0]
+	 eor 	${SVE_T0}.d,${SVE_T0}.d,${SVE_T0}.d	// set zero mask
+	umlalb	$SVE_ACC4,$SVE_INlo_3,${SVE_R1}[0]
+	 orr	z18.d,z18.d,${SVE_T1}.d		// l4 += padbit
+	umlalb	$SVE_ACC1,$SVE_INlo_3,${SVE_S3}[0]
+	 and	z17.d,z17.d,${SVE_MASK}.d	// Mask l3
+	umlalb	$SVE_ACC2,$SVE_INlo_3,${SVE_S4}[0]
+	 and	z16.d,z16.d,${SVE_MASK}.d	// Mask l2
+
+	umlalb	$SVE_ACC3,$SVE_INlo_4,${SVE_S4}[0]
+	 and	z15.d,z15.d,${SVE_MASK}.d	// Mask l1
+	umlalb	$SVE_ACC0,$SVE_INlo_4,${SVE_S1}[0]
+	 zip1	z9.s,z14.s,${SVE_T0}.s
+	umlalb	$SVE_ACC4,$SVE_INlo_4,${SVE_R0}[0]
+	 zip1	z10.s,z15.s,${SVE_T0}.s
+	umlalb	$SVE_ACC1,$SVE_INlo_4,${SVE_S2}[0]
+	 zip1	z11.s,z16.s,${SVE_T0}.s
+	umlalb	$SVE_ACC2,$SVE_INlo_4,${SVE_S3}[0]
+	 zip1	z12.s,z17.s,${SVE_T0}.s
+	 zip1	z13.s,z18.s,${SVE_T0}.s
+
+	// Sparsify the high halves to even lanes in INhi
+	zip2	z14.s,z14.s,${SVE_T0}.s
+	zip2	z15.s,z15.s,${SVE_T0}.s
+	zip2	z16.s,z16.s,${SVE_T0}.s
+	zip2	z17.s,z17.s,${SVE_T0}.s
+	zip2	z18.s,z18.s,${SVE_T0}.s
+
+	subs	$len,$len,$vl,lsl #5
+
+	// Lazy reduction
+	bl		poly1305_lazy_reduce_sve2
+	ldr	x30,[sp,#8]
+
+	b.hi	.Loop_sve2
+
+.Lskip_loop_sve2:
+
+	adds	$len,$len,$vl,lsl #4		// By half the usual input size
+	b.eq	.Lshort_tail_sve2
+
+.Long_tail_sve2:
+	////////////////////////////////////////////////////////////////
+	// (hash + inp[lo])*r^{vl} + inp[hi])*r^{vl..1}               //
+	//  \____________________/                                    //
+	//  first part of long tail                                   //
+	////////////////////////////////////////////////////////////////
+	// NB: `vl` here (and in the code) is the vector length in doublewords.
+	// Interleaving algebra with copying INhi -> INlo for the next steps.
+
+	add 	$SVE_INlo_2,$SVE_INlo_2,$SVE_H2
+	add 	$SVE_INlo_0,$SVE_INlo_0,$SVE_H0
+	add 	$SVE_INlo_1,$SVE_INlo_1,$SVE_H1
+	add 	$SVE_INlo_3,$SVE_INlo_3,$SVE_H3
+	add 	$SVE_INlo_4,$SVE_INlo_4,$SVE_H4
+
+	umullb	$SVE_ACC3,$SVE_INlo_2,${SVE_R1}[2]
+	umullb	$SVE_ACC0,$SVE_INlo_2,${SVE_S3}[2]
+	umullb	$SVE_ACC4,$SVE_INlo_2,${SVE_R2}[2]
+	umullb	$SVE_ACC1,$SVE_INlo_2,${SVE_S4}[2]
+	umullb	$SVE_ACC2,$SVE_INlo_2,${SVE_R0}[2]
+
+	umlalb	$SVE_ACC3,$SVE_INlo_0,${SVE_R3}[2]
+	umlalb	$SVE_ACC4,$SVE_INlo_0,${SVE_R4}[2]
+	umlalb	$SVE_ACC2,$SVE_INlo_0,${SVE_R2}[2]
+	umlalb	$SVE_ACC0,$SVE_INlo_0,${SVE_R0}[2]
+	umlalb	$SVE_ACC1,$SVE_INlo_0,${SVE_R1}[2]
+	mov		z11.d,z16.d
+
+	umlalb	$SVE_ACC3,$SVE_INlo_1,${SVE_R2}[2]
+	umlalb	$SVE_ACC4,$SVE_INlo_1,${SVE_R3}[2]
+	umlalb	$SVE_ACC0,$SVE_INlo_1,${SVE_S4}[2]
+	umlalb	$SVE_ACC2,$SVE_INlo_1,${SVE_R1}[2]
+	umlalb	$SVE_ACC1,$SVE_INlo_1,${SVE_R0}[2]
+	mov		z9.d,z14.d
+
+	umlalb	$SVE_ACC3,$SVE_INlo_3,${SVE_R0}[2]
+	umlalb	$SVE_ACC0,$SVE_INlo_3,${SVE_S2}[2]
+	umlalb	$SVE_ACC4,$SVE_INlo_3,${SVE_R1}[2]
+	umlalb	$SVE_ACC1,$SVE_INlo_3,${SVE_S3}[2]
+	umlalb	$SVE_ACC2,$SVE_INlo_3,${SVE_S4}[2]
+	mov		z10.d,z15.d
+
+	umlalb	$SVE_ACC3,$SVE_INlo_4,${SVE_S4}[2]
+	umlalb	$SVE_ACC0,$SVE_INlo_4,${SVE_S1}[2]
+	umlalb	$SVE_ACC4,$SVE_INlo_4,${SVE_R0}[2]
+	umlalb	$SVE_ACC1,$SVE_INlo_4,${SVE_S2}[2]
+	umlalb	$SVE_ACC2,$SVE_INlo_4,${SVE_S3}[2]
+	mov		z12.d,z17.d
+
+	// Lazy reduction
+	bl		poly1305_lazy_reduce_sve2
+	ldr	x30,[sp,#8]
+
+	mov		z13.d,z18.d
+
+.Lshort_tail_sve2:
+
+	cmp     $vl, #2
+    b.ls    .Last_reduce_sve2
+
+	mov		x15,#1
+	whilelo p1.s,xzr,x15
+
+.Loop_reduce_sve2:
+	////////////////////////////////////////////////////////////////
+	// (hash + inp[hi])*r^{vl/2..2}                               //
+	//       \____________________/                               //
+	//  iterative reduction part of the short tail                //
+	////////////////////////////////////////////////////////////////
+	// Last column of products is calculated by iteratively "folding" vectors:
+	// 1. If vl==2 - skip to Last_reduce_sve2
+	// 2. calculate product with r^{vl/2} -> ACC{0-4}
+	// 3. lazy reduction -> H{0-4}
+	// 4. upper half of vectors (INlo{0-4}) is copied to lower halves
+	// 5. If vl/2==2 - go to Last_reduce_sve2
+	// 6. continue with 2.
+	// NB: this part is skipped for 128-bit case (vl==2)
+	// For 256-bit, no intermediate loading is necessary - r^2 is already in [1].
+	//  So a special case can easily be implemented once corresponding hardware is available.
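+	// For example, with vl==8 (512-bit vectors) this loop runs twice, folding
+	//  with r^4 and then with r^2, before falling into Last_reduce_sve2 at vl==2.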
+
+	// Load the intermediate r-power into the 0th lanes of vectors
+	// Interleave with broadcasting and S-vector calculation.
+	ldr		w10,[$pwr]
+	ldr		w11,[$pwr,#28]
+	ldr		w12,[$pwr,#56]
+	cpy		$SVE_R0,p1/m,w10
+	ldr		w13,[$pwr,#84]
+	cpy		$SVE_R1,p1/m,w11
+	dup		z0.q,z0.q[0]
+	ldr		w14,[$pwr,#112]
+	cpy		$SVE_R2,p1/m,w12
+	dup		z1.q,z1.q[0]
+	cpy		$SVE_R3,p1/m,w13
+	dup		z3.q,z3.q[0]
+	cpy		$SVE_R4,p1/m,w14
+	add		$pwr,$pwr,#4			// Increment pointer for the next iteration
+	dup		z5.q,z5.q[0]
+	dup		z7.q,z7.q[0]
+
+	// Interleaved hash contraction and S-vector calc.
+	add 	$SVE_INlo_2,$SVE_INlo_2,$SVE_H2
+	adr     $SVE_S1,[$SVE_R1,$SVE_R1,lsl #2]
+	add 	$SVE_INlo_0,$SVE_INlo_0,$SVE_H0
+	adr     $SVE_S2,[$SVE_R2,$SVE_R2,lsl #2]
+	add 	$SVE_INlo_1,$SVE_INlo_1,$SVE_H1
+	adr     $SVE_S3,[$SVE_R3,$SVE_R3,lsl #2]
+	add 	$SVE_INlo_3,$SVE_INlo_3,$SVE_H3
+	adr     $SVE_S4,[$SVE_R4,$SVE_R4,lsl #2]
+	add 	$SVE_INlo_4,$SVE_INlo_4,$SVE_H4
+
+	umullb	$SVE_ACC3,$SVE_INlo_0,${SVE_R3}[0]
+	umullb	$SVE_ACC4,$SVE_INlo_0,${SVE_R4}[0]
+	umullb	$SVE_ACC2,$SVE_INlo_0,${SVE_R2}[0]
+	umullb	$SVE_ACC0,$SVE_INlo_0,${SVE_R0}[0]
+	umullb	$SVE_ACC1,$SVE_INlo_0,${SVE_R1}[0]
+
+	umlalb	$SVE_ACC3,$SVE_INlo_1,${SVE_R2}[0]
+	umlalb	$SVE_ACC4,$SVE_INlo_1,${SVE_R3}[0]
+	umlalb	$SVE_ACC0,$SVE_INlo_1,${SVE_S4}[0]
+	umlalb	$SVE_ACC2,$SVE_INlo_1,${SVE_R1}[0]
+	umlalb	$SVE_ACC1,$SVE_INlo_1,${SVE_R0}[0]
+
+	umlalb	$SVE_ACC3,$SVE_INlo_2,${SVE_R1}[0]
+	umlalb	$SVE_ACC0,$SVE_INlo_2,${SVE_S3}[0]
+	umlalb	$SVE_ACC4,$SVE_INlo_2,${SVE_R2}[0]
+	umlalb	$SVE_ACC1,$SVE_INlo_2,${SVE_S4}[0]
+	umlalb	$SVE_ACC2,$SVE_INlo_2,${SVE_R0}[0]
+
+	umlalb	$SVE_ACC3,$SVE_INlo_3,${SVE_R0}[0]
+	umlalb	$SVE_ACC0,$SVE_INlo_3,${SVE_S2}[0]
+	umlalb	$SVE_ACC4,$SVE_INlo_3,${SVE_R1}[0]
+	umlalb	$SVE_ACC1,$SVE_INlo_3,${SVE_S3}[0]
+	umlalb	$SVE_ACC2,$SVE_INlo_3,${SVE_S4}[0]
+
+	umlalb	$SVE_ACC3,$SVE_INlo_4,${SVE_S4}[0]
+	umlalb	$SVE_ACC0,$SVE_INlo_4,${SVE_S1}[0]
+	umlalb	$SVE_ACC4,$SVE_INlo_4,${SVE_R0}[0]
+	umlalb	$SVE_ACC1,$SVE_INlo_4,${SVE_S2}[0]
+	umlalb	$SVE_ACC2,$SVE_INlo_4,${SVE_S3}[0]
+
+	// Lazy reduction
+	bl		poly1305_lazy_reduce_sve2
+	ldr	x30,[sp,#8]
+
+	// Move higher part of vectors to lower part, depending on current vl
+	// NB look-up is done in terms of single-word lanes, hence indices
+	//  start from vl (referred to as w16) and not vl/2
+	// Higher part now contains "junk"
+	index	${SVE_T0}.s,w16,#1
+	tbl		${SVE_INlo_0},${SVE_INlo_0},${SVE_T0}.s
+	tbl		${SVE_INlo_1},${SVE_INlo_1},${SVE_T0}.s
+	tbl		${SVE_INlo_2},${SVE_INlo_2},${SVE_T0}.s
+	tbl		${SVE_INlo_3},${SVE_INlo_3},${SVE_T0}.s
+	tbl		${SVE_INlo_4},${SVE_INlo_4},${SVE_T0}.s
+	lsr		$vl,$vl,#1		// vl /= 2
+	cmp 	$vl,#2
+	b.hi	.Loop_reduce_sve2
+
+.Last_reduce_sve2:
+	////////////////////////////////////////////////////////////////
+	// (hash + inp[n-1])*r^2                                      //
+	//+(hash + inp[n]  )*r                                        //
+	//       \_____________/                                      //
+	//  Final part of the short tail                              //
+	////////////////////////////////////////////////////////////////
+
+	//Last hash addition - now everything stored in SVE_Hx
+	add 	$SVE_H2,$SVE_H2,$SVE_INlo_2
+	add 	$SVE_H0,$SVE_H0,$SVE_INlo_0
+	add 	$SVE_H1,$SVE_H1,$SVE_INlo_1
+	add 	$SVE_H3,$SVE_H3,$SVE_INlo_3
+	add 	$SVE_H4,$SVE_H4,$SVE_INlo_4
+
+	// Shift even lanes to odd lanes and set even to zero
+	//  because r^2 and r^1 are in lanes 1 and 3 of R-vectors
+	trn1	$SVE_H2,${SVE_MASK}.s,$SVE_H2
+	trn1	$SVE_H0,${SVE_MASK}.s,$SVE_H0
+	trn1	$SVE_H1,${SVE_MASK}.s,$SVE_H1
+	trn1	$SVE_H3,${SVE_MASK}.s,$SVE_H3
+	trn1	$SVE_H4,${SVE_MASK}.s,$SVE_H4
+
+	umullt	$SVE_ACC3,$SVE_H2,${SVE_R1}
+	umullt	$SVE_ACC0,$SVE_H2,${SVE_S3}
+	umullt	$SVE_ACC4,$SVE_H2,${SVE_R2}
+	umullt	$SVE_ACC1,$SVE_H2,${SVE_S4}
+	umullt	$SVE_ACC2,$SVE_H2,${SVE_R0}
+
+	umlalt	$SVE_ACC3,$SVE_H0,${SVE_R3}
+	umlalt	$SVE_ACC4,$SVE_H0,${SVE_R4}
+	umlalt	$SVE_ACC2,$SVE_H0,${SVE_R2}
+	umlalt	$SVE_ACC0,$SVE_H0,${SVE_R0}
+	umlalt	$SVE_ACC1,$SVE_H0,${SVE_R1}
+
+	umlalt	$SVE_ACC3,$SVE_H1,${SVE_R2}
+	umlalt	$SVE_ACC4,$SVE_H1,${SVE_R3}
+	umlalt	$SVE_ACC0,$SVE_H1,${SVE_S4}
+	umlalt	$SVE_ACC2,$SVE_H1,${SVE_R1}
+	umlalt	$SVE_ACC1,$SVE_H1,${SVE_R0}
+
+	umlalt	$SVE_ACC3,$SVE_H3,${SVE_R0}
+	umlalt	$SVE_ACC0,$SVE_H3,${SVE_S2}
+	umlalt	$SVE_ACC4,$SVE_H3,${SVE_R1}
+	umlalt	$SVE_ACC1,$SVE_H3,${SVE_S3}
+	umlalt	$SVE_ACC2,$SVE_H3,${SVE_S4}
+
+	umlalt	$SVE_ACC3,$SVE_H4,${SVE_S4}
+	umlalt	$SVE_ACC0,$SVE_H4,${SVE_S1}
+	umlalt	$SVE_ACC4,$SVE_H4,${SVE_R0}
+	umlalt	$SVE_ACC1,$SVE_H4,${SVE_S2}
+	umlalt	$SVE_ACC2,$SVE_H4,${SVE_S3}
+
+	// Generate predicate for the last two double words
+	mov		x15,#2
+	whilelo p2.d,xzr,x15
+
+	dup 	${SVE_MASK}.d,#-1
+	lsr 	${SVE_MASK}.d,${SVE_MASK}.d,#38
+
+	////////////////////////////////////////////////////////////////
+	// horizontal add
+
+	// In the Neon implementation, only the lower 64 bits of the vector registers are effectively used here.
+	// Here and below, hard-coded FP registers are used.
+
+	uaddv	d22,p2,$SVE_ACC3
+	 ldp	d8,d9,[sp,#80]		// meet ABI requirements
+	uaddv	d19,p2,$SVE_ACC0
+	 ldp	d10,d11,[sp,#96]
+	uaddv	d23,p2,$SVE_ACC4
+	 ldp	d12,d13,[sp,#112]
+	uaddv	d20,p2,$SVE_ACC1
+	 ldp	d14,d15,[sp,#128]
+	uaddv	d21,p2,$SVE_ACC2
+
+	////////////////////////////////////////////////////////////////
+	// Lazy reduction, but without narrowing
+
+	// Since results were accumulated in the lower 64 bits,
+	//  one can refer to them as FP/aSIMD reg-s.
+
+	ushr	d29,d22,#26
+	and 	v22.8b,v22.8b,v31.8b
+	ushr	d30,d19,#26
+	and 	v19.8b,v19.8b,v31.8b
+
+	add 	d23,d23,d29				// h3 -> h4
+	add 	d20,d20,d30				// h0 -> h1
+
+	ushr	d29,d23,#26
+	and 	v23.8b,v23.8b,v31.8b
+	ushr	d30,d20,#26
+	and 	v20.8b,v20.8b,v31.8b
+	add 	d21,d21,d30				// h1 -> h2
+
+	add 	d19,d19,d29
+	shl 	d29,d29,#2
+	ushr	d30,d21,#26
+	and 	v21.8b,v21.8b,v31.8b
+	add 	d19,d19,d29				// h4 -> h0
+	add 	d22,d22,d30				// h2 -> h3
+
+	ushr	d29,d19,#26
+	and 	v19.8b,v19.8b,v31.8b
+	ushr 	d30,d22,#26
+	and 	v22.8b,v22.8b,v31.8b
+	add 	d20,d20,d29				// h0 -> h1
+	add 	d23,d23,d30				// h3 -> h4
+
+	////////////////////////////////////////////////////////////////
+	// write the result, can be partially reduced
+
+	stp 	s19,s20,[$ctx],#8
+	stp 	s21,s22,[$ctx],#8
+	str 	s23,[$ctx]
+
+.Lno_data_sve2:
+	// Restore the callee-saved GPRs
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldr	x29,[sp],#144
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.Lshort_blocks:
+	b	poly1305_blocks
+
+.size	poly1305_blocks_sve2,.-poly1305_blocks_sve2
+___
+
+##############################################################################
+#
+# SVE instruction encoder, adapted from chacha-armv8-sve.pl
+#
+##############################################################################
+
+my $debug_encoder = 0;
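+# Set $debug_encoder to 1 to have create_verifier() emit ./compile_sve.sh and
+# verify_inst() cross-check every hand-encoded .inst word against a real assembler.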
+
+{
+my  %opcode_unpred = (
+	"eor"          => 0x04a03000,
+	"add"          => 0x04200000,
+	"orr"          => 0x04603000,
+	"mov"          => 0x04603000, # Alias for ORR
+	"and"          => 0x04203000,
+	"lsl"          => 0x04209C00,
+	"lsr"          => 0x04209400,
+	"zip1"         => 0x05206000,
+	"zip2"         => 0x05206400,
+	"trn1"         => 0x05207000,
+	"dup_gpr"      => 0x05203800,
+	"dup_elem"     => 0x05302000,
+	"cntd"         => 0x04e0e000,
+	"tbl"          => 0x05203000,
+	"adr"          => 0x04a0a000,
+	"umullb"       => 0x44e0d000,
+    "umullt"       => 0x45c07c00,
+    "umlalb"       => 0x44e09000,
+    "umlalt"       => 0x44c04c00,
+	"shrnb"        => 0x45201000);
+
+my  %opcode_imm_unpred = (
+	"dup"          => 0x2538C000,
+	"index"        => 0x04204400);
+
+my %opcode_scalar_pred = (
+	"cpy"          => 0x0528A000);
+
+my  %opcode_pred = (
+	"whilelo"      => 0x25200C00,
+	"ptrue"        => 0x2518E000,
+	"ld4w"         => 0xA560E000,
+	"ld1w"         => 0xA540A000,
+	"revb"         => 0x05248000,
+    "uaddv"        => 0x04012000);
+
+my  %tsize = (
+	'b'          => 0,
+	'h'          => 1,
+	's'          => 2,
+	'd'          => 3,
+	'q'          => 3); # To handle dup zx.q,zx.q[i] case
+
+my %sf = (
+	"w"          => 0,
+	"x"          => 1);
+
+my %pattern = ("ALL" => 31);
+
+sub create_verifier {
+	my $filename="./compile_sve.sh";
+
+$scripts = <<'___';
+#! /bin/bash
+set -e
+CROSS_COMPILE=${CROSS_COMPILE:-'aarch64-linux-gnu-'}
+
+[ -z "$1" ] && exit 1
+INST_TO_COMPILE="$1"
+FILENAME_BASE=${1%% *}
+TMPFILE="/tmp/${FILENAME_BASE}_test"
+OBJDUMP_LOG="/tmp/${FILENAME_BASE}_objdump.log"
+
+echo "--- DEBUG INFO ---" >&2
+echo "Received \$1 (Instruction): '$1'" >&2
+echo "Using Filename Base: '$FILENAME_BASE'" >&2
+echo "------------------" >&2
+
+ARCH=`uname -p | xargs echo -n`
+
+if [ $ARCH == 'aarch64' ]; then
+    CC=gcc-11
+    AS=as
+    OBJDUMP=objdump
+else
+    CC=${CROSS_COMPILE}gcc
+    AS=${CROSS_COMPILE}as
+    OBJDUMP=${CROSS_COMPILE}objdump
+fi
+
+cat > "${TMPFILE}.c" << EOF
+extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
+{
+    asm("$INST_TO_COMPILE");
+}
+int main(int argc, char *argv[])
+{
+}
+EOF
+
+$CC -march=armv8.2-a+sve+sve2 -S -o "${TMPFILE}.s" "${TMPFILE}.c"
+
+$AS -march=armv8-a+sve2 -o "${TMPFILE}.o" "${TMPFILE}.s"
+
+#$OBJDUMP -d "${TMPFILE}.o" > "$OBJDUMP_LOG"
+
+#cat "$OBJDUMP_LOG" | awk -F"\n" -v RS="\n\n" '$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",$2}'
+$OBJDUMP -d "${TMPFILE}.o" | awk -F"\n" -v RS="\n\n" '$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",$2}'
+
+rm "${TMPFILE}.c" "${TMPFILE}.s" "${TMPFILE}.o"
+___
+	open(FH, '>', $filename) or die $!;
+	print FH $scripts;
+	close(FH);
+	system("chmod a+x ./compile_sve.sh");
+}
+
+sub compile_sve {
+	my $inst = shift;
+    return `./compile_sve.sh "$inst"`;
+}
+
+sub verify_inst {
+	my ($code,$inst)=@_;
+	my $hexcode = (sprintf "%08x", $code);
+
+	if ($debug_encoder == 1) {
+		my $expect=&compile_sve($inst);
+		if ($expect ne $hexcode) {
+			return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
+		}
+	}
+	return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
+}
+
+sub reg_code {
+	my $code = shift;
+
+	if ($code eq "zr") {
+		return "31";
+	}
+	return $code;
+}
+
+sub encode_size_imm() {
+	my ($mnemonic, $isize, $const)=@_;
+	my $esize = (8<<$tsize{$isize});
+	my $tsize_imm;
+	if ($mnemonic eq "shrnb") {
+        # Formula for narrowing shifts
+        $tsize_imm = $esize - $const;
+    } elsif ($mnemonic eq "lsr") {
+        # Formula for logical right shifts
+        $tsize_imm = 2*$esize - $const;
+    } else {
+        # Default formula for logical left shifts (lsl)
+        $tsize_imm = $esize + $const;
+    }
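+	# Example: an "lsr" of .d elements by #26 gives tsize_imm = 2*64-26 = 102,
+	# i.e. 0b11 into bits 23:22 and 0b00110 into bits 20:16.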
+	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
+}
+
+sub sve_unpred {
+    my ($mnemonic,$arg)=@_;
+    my $inst = (sprintf "%s %s", $mnemonic,$arg);
+    # Special case: Widening multiplies (indexed and vector)
+    if (($mnemonic =~ /^(umull[bt]|umlal[bt])/) && $arg =~ m/z([0-9]+)\.d,\s*z([0-9]+)\.s,\s*z([0-9]+)\.s(\[([0-9]+)\])?/o) {
+        my ($zd, $zn, $zm, $indexed, $imm) = ($1, $2, $3, $4, $5);
+        my $opcode = $opcode_unpred{$mnemonic};
+        if ($indexed) {
+			# Split the 2-bit immediate index into its parts.
+            my $i2h = ($imm >> 1) & 0x1; # High bit of index
+            my $i2l = $imm & 0x1;       # Low bit of index
+            # Get the low 4 bits of the Zm register.
+            my $zm_low = $zm & 0xF;
+            return &verify_inst($opcode|($i2h << 20)|($zm_low << 16)|($i2l << 11)|($zn << 5)|$zd,$inst);
+        } else {
+            return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst);
+        }
+    # Special case: 3-register vector ADR with lsl #2
+    } elsif ($mnemonic eq "adr" && $arg =~ m/z([0-9]+)\.s,\s*\[z([0-9]+)\.s,\s*z([0-9]+)\.s,\s*lsl\s*#2\]/o) {
+        my ($zd, $zn, $zm) = ($1, $2, $3);
+        my $opcode = $opcode_unpred{"adr"};
+        # Per the manual, the 'sz' bit (22) must be 0 for .s size.
+        # It is already 0 in our base, so we do nothing.
+        # The 'msz' field (bits 11-10) must be '10'. We achieve this by setting bit 11.
+        $opcode |= (1<<11);
+        return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst);
+    # Special case: 'cntd xd' alias
+    } elsif ($mnemonic eq "cntd" && $arg =~ m/x([0-9]+)/o) {
+        my ($xd) = ($1);
+        my $opcode = $opcode_unpred{$mnemonic};
+        my $pattern_all = $pattern{"ALL"} << 5;
+        return &verify_inst($opcode|$xd|$pattern_all, $inst);
+    # Special parser for SHRNB's unique syntax (Zd.s, Zn.d, #imm)
+    } elsif ($mnemonic eq "shrnb" && $arg =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.([bhsd]),\s*#([0-9]+)/o) {
+        my ($zd, $zn, $size_src, $imm) = ($1, $2, $3, $4);
+        my $opcode = $opcode_unpred{$mnemonic};
+        return &verify_inst($opcode|&encode_size_imm($mnemonic,$size_src,$imm)|($zn << 5)|$zd, $inst);
+	} elsif ($mnemonic eq "dup" && $arg =~ m/z([0-9]+)\.q,\s*z([0-9]+)\.q\[0\]/o) { # DUP from element
+        my ($zd, $zn) = ($1, $2);
+        my $opcode = $opcode_unpred{"dup_elem"};
+        return &verify_inst($opcode | ($zn << 5) | $zd, $inst);
+	} elsif ($mnemonic eq "dup" && $arg =~ m/z([0-9]+)\.([bhsdq]),\s*w([0-9]+)/o) { # DUP from GPR (wX/xX)
+        my ($zd, $size, $rn) = ($1, $2, $3);
+        my $opcode = $opcode_unpred{"dup_gpr"};
+        $opcode |= ($tsize{$size}<<22);
+        return &verify_inst($opcode|$zd|($rn<<5), $inst);
+	# Generic argument patterns
+    } elsif ($arg =~ m/z([0-9]+)\.([bhsdq]),\s*(.*)/o) {
+        my ($zd, $size, $regs) = ($1, $2, $3);
+        my $opcode = $opcode_unpred{$mnemonic};
+		# Handle shift-by-immediate separately due to its unique encoding.
+        if ($mnemonic eq "lsl" || $mnemonic eq "lsr") {
+            if ($regs =~ m/z([0-9]+)\.[bhsd],\s*#([0-9]+)/o) {
+                my ($zn, $imm) = ($1, $2);
+                return &verify_inst($opcode|$zd|($zn<<5)|&encode_size_imm($mnemonic,$size,$imm), $inst);
+            }
+        }
+		if ($mnemonic !~ /^(and|orr|eor|mov)$/) {
+        	$opcode |= ($tsize{$size}<<22);
+    	}
+        if ($regs =~ m/z([0-9]+)\.[bhsdq],\s*z([0-9]+)\.[bhsdq]/o) { # 3-operand vector
+            my ($zn, $zm) = ($1, $2);
+            return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst);
+        } elsif ($regs =~ m/z([0-9]+)\.[bhsdq]/o) { # 2-operand vector (mov)
+            my $zn = $1;
+            my $zm = ($mnemonic eq "mov") ? $zn : 0;
+            return &verify_inst($opcode|$zd|($zn<<5)|($zm<<16), $inst);
+        } elsif ($regs =~ m/w([0-9]+),\s*#1/o) { # index
+            my ($rn, $rm) = ($1, 1);
+            $opcode = $opcode_imm_unpred{"index"};
+			$opcode |= ($tsize{$size}<<22);
+            return &verify_inst($opcode|$zd|($rn<<5)|($rm<<16), $inst);
+        } elsif ($regs =~ m/#(-?[0-9]+)/o) { # dup from immediate
+            my $imm = $1;
+            $opcode = $opcode_imm_unpred{"dup"};
+			$opcode |= ($tsize{$size}<<22);
+            my $imm_val = $imm & 0xff; # Only accounting for a simple case with zero shift.
+            return &verify_inst($opcode|$zd|($imm_val<<5), $inst);
+        }
+    }
+    sprintf "%s // fail to parse: %s", $mnemonic, $arg;
+}
+
+sub sve_pred {
+    my ($mnemonic, $arg)=@_;
+    my $inst = (sprintf "%s %s", $mnemonic,$arg);
+    # Special case: Multi-register loads (ld4w)
+    if ($arg =~ m/\{\s*z([0-9]+)\.s-z([0-9]+)\.s\s*\},\s*p([0-9]+)\/z,\s*\[(x[0-9]+)\]/o) {
+        my ($zt, $pg, $xn) = ($1, $3, $4);
+        $xn =~ s/x//;
+        my $opcode = $opcode_pred{$mnemonic};
+        return &verify_inst($opcode|$zt|($pg<<10)|($xn<<5), $inst);
+    # Special case: Single-register loads (ld1w)
+    } elsif ($arg =~ m/\{\s*z([0-9]+)\.s\s*\},\s*p([0-9]+)\/z,\s*\[(x[0-9]+)\]/o) {
+        my ($zt, $pg, $xn) = ($1, $2, $3);
+        $xn =~ s/x//;
+        my $opcode = $opcode_pred{$mnemonic};
+        return &verify_inst($opcode|$zt|($pg<<10)|($xn<<5), $inst);
+    # Special case: uaddv (scalar destination)
+    } elsif ($mnemonic eq "uaddv" && $arg =~ m/d([0-9]+),\s*p([0-9]+),\s*z([0-9]+)\.([bhsd])/o) {
+        my ($vd, $pg, $zn, $size) = ($1, $2, $3, $4);
+        my $opcode = $opcode_pred{$mnemonic};
+        return &verify_inst($opcode|($tsize{$size}<<22)|$vd|($pg<<10)|($zn<<5), $inst);
+    # Generic pattern: Starts with a predicate register (whilelo, ptrue)
+    } elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(.*)/o) {
+        my ($pd, $size, $regs) = ($1, $2, $3);
+        my $opcode = $opcode_pred{$mnemonic};
+        if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) { # whilelo
+            my ($sf_char, $rn, $rm) = ($1, $2, $3);
+            return &verify_inst($opcode|($tsize{$size}<<22)|$pd|($sf{$sf_char}<<12)|(&reg_code($rn)<<5)|(&reg_code($rm)<<16), $inst);
+        } elsif ($regs =~ m/(\w+)/o) { # ptrue
+            my $pat = $1;
+            return &verify_inst($opcode|($tsize{$size}<<22)|$pd|($pattern{$pat}<<5), $inst);
+        }
+    # Generic pattern: Starts with a vector register (cpy, revb)
+    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/m,\s*(.*)/o) {
+        my ($zd, $size, $pg, $regs) = ($1, $2, $3, $4);
+        if ($regs =~ m/w([0-9]+)/o) { # CPY from GPR
+            my $wn = $1;
+            my $opcode = $opcode_scalar_pred{"cpy"};
+            return &verify_inst($opcode|($tsize{$size}<<22)|$zd|($pg<<10)|($wn<<5), $inst);
+        } elsif ($regs =~ m/z([0-9]+)\.([bhsd])/o) { # 2-operand predicated (revb)
+            my ($zn) = ($1);
+            my $opcode = $opcode_pred{$mnemonic};
+            return &verify_inst($opcode|($tsize{$size}<<22)|$zd|($pg<<10)|($zn<<5), $inst);
+        }
+    }
+    sprintf "%s // fail to parse: %s", $mnemonic, $arg;
+}
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/\/\// and !/^$/);
+	print;
+}
+close SELF;
+
+if ($debug_encoder == 1) {
+	&create_verifier();
+}
+
+foreach my $line (split("\n",$code)) {
+    my $original_line = $line;
+    my $encoded_line = "";
+    # Perform variable substitution
+    $line =~ s/\`([^\`]*)\`/eval($1)/ge;
+    # Predicated instructions
+    if ($line =~ /^\s*(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/) {
+        $encoded_line = sve_pred($1, $2);
+    }
+	elsif ($line =~ /^\s*(\w+)\s+(d[0-9]+,\s*p[0-9].*)/) {
+        $encoded_line = sve_pred($1, $2);
+    }
+    elsif ($line =~ /^\s*(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/) {
+        $encoded_line = sve_pred($1, $2);
+    }
+    elsif ($line =~ /^\s*(\w+)\s+(p[0-9]+\.[bhsd].*)/) {
+        $encoded_line = sve_pred($1, $2);
+    }
+    # Specific unpredicated instructions
+    elsif ($line =~ /^\s*(dup)\s+(z[0-9]+\.q,\s*z[0-9]+\.q\[0\])/) {
+        $encoded_line = sve_unpred($1, $2);
+    }
+    elsif ($line =~ /^\s*(dup)\s+(z[0-9]+\.[bhsdq],\s*(?:w|x)[0-9]+)/) {
+        $encoded_line = sve_unpred($1, $2);
+    }
+    elsif ($line =~ /^\s*(mov)\s+(z[0-9]+\.d,\s*z[0-9]+\.d)/) {
+        $encoded_line = sve_unpred("mov", $2);
+    }
+    elsif ($line =~ /^\s*(umull[bt]|umlal[bt])\s+(z[0-9]+\.d,\s*z[0-9]+\.s,\s*z[0-9]+\.s(?:\[[0-9]+\])?)/) {
+        $encoded_line = sve_unpred($1, $2);
+    }
+    elsif ($line =~ /^\s*(cntd)\s+((x|w)[0-9]+.*)/) {
+        $encoded_line = sve_unpred($1, $2);
+    }
+    # 3. Generic Unpredicated "catch-all"
+    elsif ($line =~ /^\s*(\w+)\s+(z[0-9]+\.[bhsdq].*)/) {
+        $encoded_line = sve_unpred($1, $2);
+    }
+    if ($encoded_line) {
+        print $encoded_line, "\n";
+    } else {
+        print $original_line, "\n";
+    }
+}
+
+}
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/poly1305/build.info b/crypto/poly1305/build.info
index e359a2225d..5c35c8ecee 100644
--- a/crypto/poly1305/build.info
+++ b/crypto/poly1305/build.info
@@ -14,7 +14,7 @@ IF[{- !$disabled{asm} -}]
   $POLY1305ASM_s390x=poly1305-s390x.S

   $POLY1305ASM_armv4=poly1305-armv4.S
-  $POLY1305ASM_aarch64=poly1305-armv8.S
+  $POLY1305ASM_aarch64=poly1305-armv8.S poly1305-armv9-sve2.S

   $POLY1305ASM_ppc32=poly1305_ppc.c poly1305-ppc.s poly1305-ppcfp.s
   $POLY1305ASM_ppc64=$POLY1305ASM_ppc32
@@ -45,7 +45,9 @@ GENERATE[poly1305-ppcfp.s]=asm/poly1305-ppcfp.pl
 GENERATE[poly1305-armv4.S]=asm/poly1305-armv4.pl
 INCLUDE[poly1305-armv4.o]=..
 GENERATE[poly1305-armv8.S]=asm/poly1305-armv8.pl
+GENERATE[poly1305-armv9-sve2.S]=asm/poly1305-armv9-sve2.pl
 INCLUDE[poly1305-armv8.o]=..
+INCLUDE[poly1305-armv9-sve2.o]=..
 GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl
 INCLUDE[poly1305-mips.o]=..
 GENERATE[poly1305-c64xplus.S]=asm/poly1305-c64xplus.pl