Commit 467e3d1e42 for qemu.org
commit 467e3d1e422df7f08c70e4be6b12e932b50afa59
Author: Richard Henderson <richard.henderson@linaro.org>
Date: Tue Jun 9 12:20:51 2026 -0700
target/arm: Implement LUTI4 (four registers, 8-bit)
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20260609192110.752384-28-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h
index db1bdb4786..11e8c7a57c 100644
--- a/target/arm/cpu-features.h
+++ b/target/arm/cpu-features.h
@@ -1668,6 +1668,11 @@ static inline bool isar_feature_aa64_sme2_f8cvt(const ARMISARegisters *id)
return isar_feature_aa64_sme2(id) && isar_feature_aa64_f8cvt(id);
}
+static inline bool isar_feature_aa64_sme2p1_lutv2(const ARMISARegisters *id)
+{ /* True only when both the sme2p1 and the sme_lutv2 feature tests pass. */
+ return isar_feature_aa64_sme2p1(id) && isar_feature_aa64_sme_lutv2(id);
+}
+
static inline bool isar_feature_aa64_sve_i8mm(const ARMISARegisters *id)
{
return isar_feature_aa64_sve(id) && isar_feature_aa64_sme_sve_i8mm(id);
diff --git a/target/arm/tcg/helper-defs.h b/target/arm/tcg/helper-defs.h
index 05ccf795e8..8ec6c16319 100644
--- a/target/arm/tcg/helper-defs.h
+++ b/target/arm/tcg/helper-defs.h
@@ -1120,6 +1120,7 @@ DEF_HELPER_FLAGS_4(sme2_luti4_2b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_4(sme2_luti4_2h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_4(sme2_luti4_2s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
+DEF_HELPER_FLAGS_4(sme2_luti4_4b, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_4(sme2_luti4_4h, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
DEF_HELPER_FLAGS_4(sme2_luti4_4s, TCG_CALL_NO_RWG, void, ptr, ptr, env, i32)
diff --git a/target/arm/tcg/sme.decode b/target/arm/tcg/sme.decode
index 339de72b8a..495330aed7 100644
--- a/target/arm/tcg/sme.decode
+++ b/target/arm/tcg/sme.decode
@@ -1014,8 +1014,14 @@ LUTI4_c_2s 1100 0000 1000 101 idx:2 1 10 00 zn:5 .... 0 &lut zd=%zd_ax2
LUTI4_c_4h 1100 0000 1000 101 idx:1 10 01 00 zn:5 ... 00 &lut zd=%zd_ax4
LUTI4_c_4s 1100 0000 1000 101 idx:1 10 10 00 zn:5 ... 00 &lut zd=%zd_ax4
+LUTI4_c_4b 1100 0000 1000 101 1 00 00 00 ....0 ...00 \
+ &lut zd=%zd_ax4 zn=%zn_ax2 idx=0
+
# LUTI4, strided (must check zd alignment)
LUTI4_s_2b 1100 0000 1001 101 idx:2 1 00 00 zn:5 zd:5 &lut
LUTI4_s_2h 1100 0000 1001 101 idx:2 1 01 00 zn:5 zd:5 &lut
LUTI4_s_4h 1100 0000 1001 101 idx:1 10 01 00 zn:5 zd:5 &lut
+
+LUTI4_s_4b 1100 0000 1001 101 1 00 00 00 ....0 zd:5 \
+ &lut zn=%zn_ax2 idx=0
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index c4bede2ae3..98d3d18791 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -1846,6 +1846,9 @@ TRANS_FEAT(LUTI4_c_2s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_2s, false)
TRANS_FEAT(LUTI4_c_4h, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_4h, false)
TRANS_FEAT(LUTI4_c_4s, aa64_sme2, do_lut, a, gen_helper_sme2_luti4_4s, false)
+TRANS_FEAT(LUTI4_c_4b, aa64_sme_lutv2, do_lut, a,
+ gen_helper_sme2_luti4_4b, false)
+
static bool do_lut_s4(DisasContext *s, arg_lut *a, gen_helper_gvec_2_ptr *fn)
{
return !(a->zd & 0b01100) && do_lut(s, a, fn, true);
@@ -1866,3 +1869,6 @@ TRANS_FEAT(LUTI4_s_2b, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti4_2b)
TRANS_FEAT(LUTI4_s_2h, aa64_sme2p1, do_lut_s8, a, gen_helper_sme2_luti4_2h)
TRANS_FEAT(LUTI4_s_4h, aa64_sme2p1, do_lut_s4, a, gen_helper_sme2_luti4_4h)
+
+TRANS_FEAT(LUTI4_s_4b, aa64_sme2p1_lutv2, do_lut_s4, a,
+ gen_helper_sme2_luti4_4b)
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index f4ff56e034..3c06e93e02 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -3345,6 +3345,20 @@ DO_SME2_LUT(4,4,s, 4)
#undef DO_SME2_LUT
+void helper_sme2_luti4_4b(void *zd, void *zn, CPUARMState *env, uint32_t desc)
+{ /* Helper for LUTI4 (four registers, 8-bit); the lookup is in do_lut_b(). */
+ unsigned vl = simd_oprsz(desc); /* byte count copied per source below */
+ unsigned strided = extract32(desc, SIMD_DATA_SHIFT, 1); /* 1-bit flag */
+ unsigned dstride = !strided ? 1 : 4; /* in registers; scaled to bytes below */
+ uint64_t indexes[ARM_MAX_VQ * 4]; /* local copy of the index data */
+
+ memcpy(&indexes, zn, vl); /* first vl bytes: taken from zn */
+ memcpy((void *)&indexes + vl, zn + sizeof(ARMVectorReg), vl); /* next vl */
+
+ do_lut_b(zd, indexes, (void *)env->za_state.zt0, vl, 0, /* zt0 from env */
+ dstride * sizeof(ARMVectorReg), 4, 32, 4); /* NOTE(review): confirm args */
+}
+
void HELPER(gvec_luti2_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
unsigned part = simd_data(desc);