Commit 710be4a712 for qemu.org
commit 710be4a71224f8cdecdd0dd69110a8099bc8a3e6
Author: James Hilliard <james.hilliard1@gmail.com>
Date: Mon May 11 12:22:55 2026 -0600
target/mips: add Octeon QMAC instructions
QMAC.0x and QMACS.0x multiply the selected signed Q15 halfword lane from
rs by rt<15:0> and accumulate the Q31 product into the Octeon HI/LO
accumulator state.
QMAC updates the full 64-bit HI/LO accumulator. QMACS saturates the
32-bit Q31 result in LO and keeps HI<0> as the sticky saturation flag.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: James Hilliard <james.hilliard1@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
[PMD: Add min32/max32 in trans_QMACS()]
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-Id: <20260520172313.23777-24-philmd@linaro.org>
diff --git a/target/mips/tcg/octeon.decode b/target/mips/tcg/octeon.decode
index 4d0ad05834..2d02b4e0bc 100644
--- a/target/mips/tcg/octeon.decode
+++ b/target/mips/tcg/octeon.decode
@@ -28,6 +28,8 @@ BBIT 11 set:1 . 10 rs:5 ..... offset:s16 p=%bbit_p
# SEQI rt, rs, immediate
# SNE rd, rs, rt
# SNEI rt, rs, immediate
+# QMAC.0x rs, rt
+# QMACS.0x rs, rt
@r3 ...... rs:5 rt:5 rd:5 ..... ......
&cmpi rs rt imm
@@ -43,6 +45,8 @@ SEQ 011100 ..... ..... ..... 00000 101010 @r3
SNE 011100 ..... ..... ..... 00000 101011 @r3
SEQI 011100 rs:5 rt:5 imm:s10 101110 &cmpi
SNEI 011100 rs:5 rt:5 imm:s10 101111 &cmpi
+QMACS 011100 rs:5 rt:5 00000 000 lane:2 010010
+QMAC 011100 rs:5 rt:5 00000 100 lane:2 010010
&r2 rs rt
@r2 ...... rs:5 rt:5 ..... ..... ...... &r2
diff --git a/target/mips/tcg/octeon_translate.c b/target/mips/tcg/octeon_translate.c
index 7ec942fa34..90bd68cbf2 100644
--- a/target/mips/tcg/octeon_translate.c
+++ b/target/mips/tcg/octeon_translate.c
@@ -355,3 +355,65 @@ static bool trans_V3MULU(DisasContext *ctx, arg_V3MULU *a)
gen_store_gpr(tmp, a->rd);
return true;
}
+
+static bool trans_QMAC(DisasContext *ctx, arg_QMAC *a)
+{
+ TCGv_i64 t0 = tcg_temp_new_i64();
+ TCGv_i64 t1 = tcg_temp_new_i64();
+
+ gen_load_gpr(t0, a->rt);
+ gen_load_gpr(t1, a->rs);
+
+ /* QMAC.0x: t0 = sext(rt<15:0>) * sext(rs halfword a->lane) * 2 */
+ tcg_gen_ext16s_i64(t0, t0);
+ tcg_gen_sextract_i64(t1, t1, a->lane * 16, 16);
+ tcg_gen_mul_i64(t0, t0, t1);
+ tcg_gen_add_i64(t0, t0, t0);
+
+ /* Only -0x8000 * -0x8000 * 2 = 0x80000000 exceeds INT32_MAX; clamp it */
+ tcg_gen_smin_i64(t0, t0, tcg_constant_i64(INT32_MAX));
+
+ /* {HI<31:0>, LO<31:0>} += t0 as one 64-bit sum; halves stored sign-extended */
+ tcg_gen_concat32_i64(t1, cpu_LO[0], cpu_HI[0]);
+ tcg_gen_add_i64(t0, t0, t1);
+ tcg_gen_sextract_i64(cpu_LO[0], t0, 0, 32);
+ tcg_gen_sextract_i64(cpu_HI[0], t0, 32, 32);
+ return true;
+}
+
+static bool trans_QMACS(DisasContext *ctx, arg_QMACS *a)
+{
+ TCGv_i64 min32 = tcg_constant_i64(INT32_MIN);
+ TCGv_i64 max32 = tcg_constant_i64(INT32_MAX);
+ TCGv_i64 t0 = tcg_temp_new_i64();
+ TCGv_i64 t1 = tcg_temp_new_i64();
+
+ gen_load_gpr(t0, a->rt);
+ gen_load_gpr(t1, a->rs);
+
+ /* QMACS.0x: t0 = sext(rt<15:0>) * sext(rs halfword a->lane) * 2 */
+ tcg_gen_ext16s_i64(t0, t0);
+ tcg_gen_sextract_i64(t1, t1, a->lane * 16, 16);
+ tcg_gen_mul_i64(t0, t0, t1);
+ tcg_gen_add_i64(t0, t0, t0);
+
+ /*
+ * t1 = product clamped to INT32_MAX (-0x8000 * -0x8000 * 2 = 0x80000000).
+ * t0 = (product != t1), i.e. 1 on saturation; OR it into bit 0 of HI.
+ */
+ tcg_gen_smin_i64(t1, t0, max32);
+ tcg_gen_setcond_i64(TCG_COND_NE, t0, t0, t1);
+ tcg_gen_or_i64(cpu_HI[0], cpu_HI[0], t0);
+
+ /*
+ * LO = sat32(sext32(LO) + t1), where t1 is the clamped product.
+ * t0 = (sum != LO), i.e. 1 on saturation; OR it into bit 0 of HI.
+ */
+ tcg_gen_ext32s_i64(t0, cpu_LO[0]);
+ tcg_gen_add_i64(t0, t0, t1);
+ tcg_gen_smin_i64(cpu_LO[0], t0, max32);
+ tcg_gen_smax_i64(cpu_LO[0], cpu_LO[0], min32);
+ tcg_gen_setcond_i64(TCG_COND_NE, t0, t0, cpu_LO[0]);
+ tcg_gen_or_i64(cpu_HI[0], cpu_HI[0], t0);
+ return true;
+}