[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 32/76] target/arm: Set up float_status to use for FPCR.AH=1 behav
From: |
Peter Maydell |
Subject: |
[PATCH 32/76] target/arm: Set up float_status to use for FPCR.AH=1 behaviour |
Date: |
Fri, 24 Jan 2025 16:27:52 +0000 |
When FPCR.AH is 1, the behaviour of some instructions changes:
* AdvSIMD BFCVT, BFCVTN, BFCVTN2, BFMLALB, BFMLALT
* SVE BFCVT, BFCVTNT, BFMLALB, BFMLALT, BFMLSLB, BFMLSLT
* SME BFCVT, BFCVTN, BFMLAL, BFMLSL (these are all in SME2 which
QEMU does not yet implement)
* FRECPE, FRECPS, FRECPX, FRSQRTE, FRSQRTS
The behaviour change is:
* the instructions do not update the FPSR cumulative exception flags
* trapped floating point exceptions are disabled (a no-op for QEMU,
which doesn't implement FPCR.{IDE,IXE,UFE,OFE,DZE,IOE})
* rounding is always round-to-nearest-even regardless of FPCR.RMode
* denormalized inputs and outputs are always flushed to zero, as if
FPCR.{FZ,FIZ} is {1,1}
* FPCR.FZ16 is still honoured for half-precision inputs
(See the Arm ARM DDI0487L.a section A1.5.9.)
We can provide all these behaviours with another pair of float_status fields
which we use only for these insns, when FPCR.AH is 1. These float_status
fields will always have:
* flush_to_zero and flush_inputs_to_zero set for the non-F16 field
* rounding mode set to round-to-nearest-even
and so the only FPCR fields they need to honour are DN and FZ16.
In this commit we only define the new fp_status fields and give them
the required behaviour when FPSR is updated. In subsequent commits
we will arrange to use this new fp_status field for the instructions
that should be affected by FPCR.AH in this way.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
I'm not super enthusiastic about the ah_fp_status naming, which sort
of suggests it's always to be used when AH=1, rather than "for this
specific group of insns when AH=1". But I couldn't think of a better
name that was still reasonably short...
---
target/arm/cpu.h | 15 +++++++++++++++
target/arm/internals.h | 2 ++
target/arm/tcg/translate.h | 14 ++++++++++++++
target/arm/cpu.c | 4 ++++
target/arm/vfp_helper.c | 13 ++++++++++++-
5 files changed, 47 insertions(+), 1 deletion(-)
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index c8b44c725d0..cfb16151577 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -640,6 +640,13 @@ typedef struct CPUArchState {
* standard_fp_status : the ARM "Standard FPSCR Value"
* standard_fp_status_fp16 : used for half-precision
* calculations with the ARM "Standard FPSCR Value"
+ * ah_fp_status: used for the A64 insns which change behaviour
+ * when FPCR.AH == 1 (bfloat16 conversions and multiplies,
+ * and the reciprocal and square root estimate/step insns)
+ * ah_fp_status_f16: used for the A64 insns which change behaviour
+ * when FPCR.AH == 1 (bfloat16 conversions and multiplies,
+ * and the reciprocal and square root estimate/step insns);
+ * for half-precision
*
* Half-precision operations are governed by a separate
* flush-to-zero control bit in FPSCR:FZ16. We pass a separate
@@ -654,6 +661,12 @@ typedef struct CPUArchState {
* the "standard FPSCR" tracks the FPSCR.FZ16 bit rather than
* using a fixed value for it.
*
+ * The ah_fp_status is needed because some insns have different
+ * behaviour when FPCR.AH == 1: they don't update cumulative
+ * exception flags, they act like FPCR.{FZ,FIZ} = {1,1} and
+ * they ignore FPCR.RMode. But they don't ignore FPCR.FZ16,
+ * which means we need an ah_fp_status_f16 as well.
+ *
* To avoid having to transfer exception bits around, we simply
* say that the FPSCR cumulative exception flags are the logical
* OR of the flags in the four fp statuses. This relies on the
@@ -666,6 +679,8 @@ typedef struct CPUArchState {
float_status fp_status_f16_a64;
float_status standard_fp_status;
float_status standard_fp_status_f16;
+ float_status ah_fp_status;
+ float_status ah_fp_status_f16;
uint64_t zcr_el[4]; /* ZCR_EL[1-3] */
uint64_t smcr_el[4]; /* SMCR_EL[1-3] */
diff --git a/target/arm/internals.h b/target/arm/internals.h
index 98073acc276..b3187341456 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -1831,5 +1831,7 @@ int alle1_tlbmask(CPUARMState *env);
/* Set the float_status behaviour to match the Arm defaults */
void arm_set_default_fp_behaviours(float_status *s);
+/* Set the float_status behaviour to match Arm FPCR.AH=1 behaviour */
+void arm_set_ah_fp_behaviours(float_status *s);
#endif
diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
index c37c0b539e2..d6edd8db76b 100644
--- a/target/arm/tcg/translate.h
+++ b/target/arm/tcg/translate.h
@@ -676,6 +676,8 @@ typedef enum ARMFPStatusFlavour {
FPST_FPCR_A64,
FPST_FPCR_F16_A32,
FPST_FPCR_F16_A64,
+ FPST_FPCR_AH,
+ FPST_FPCR_AH_F16,
FPST_STD,
FPST_STD_F16,
} ARMFPStatusFlavour;
@@ -696,6 +698,12 @@ typedef enum ARMFPStatusFlavour {
* for AArch32 operations controlled by the FPCR where FPCR.FZ16 is to be
used
* FPST_FPCR_F16_A64
* for AArch64 operations controlled by the FPCR where FPCR.FZ16 is to be
used
+ * FPST_FPCR_AH:
+ * for AArch64 operations which change behaviour when AH=1 (specifically,
+ * bfloat16 conversions and multiplies, and the reciprocal and square root
+ * estimate/step insns)
+ * FPST_FPCR_AH_F16:
+ * ditto, but for half-precision operations
* FPST_STD
* for A32/T32 Neon operations using the "standard FPSCR value"
* FPST_STD_F16
@@ -719,6 +727,12 @@ static inline TCGv_ptr fpstatus_ptr(ARMFPStatusFlavour
flavour)
case FPST_FPCR_F16_A64:
offset = offsetof(CPUARMState, vfp.fp_status_f16_a64);
break;
+ case FPST_FPCR_AH:
+ offset = offsetof(CPUARMState, vfp.ah_fp_status);
+ break;
+ case FPST_FPCR_AH_F16:
+ offset = offsetof(CPUARMState, vfp.ah_fp_status_f16);
+ break;
case FPST_STD:
offset = offsetof(CPUARMState, vfp.standard_fp_status);
break;
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 1ba22c4c7aa..8fa220a7165 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -556,6 +556,10 @@ static void arm_cpu_reset_hold(Object *obj, ResetType type)
arm_set_default_fp_behaviours(&env->vfp.fp_status_f16_a32);
arm_set_default_fp_behaviours(&env->vfp.fp_status_f16_a64);
arm_set_default_fp_behaviours(&env->vfp.standard_fp_status_f16);
+ arm_set_ah_fp_behaviours(&env->vfp.ah_fp_status);
+ set_flush_to_zero(1, &env->vfp.ah_fp_status);
+ set_flush_inputs_to_zero(1, &env->vfp.ah_fp_status);
+ arm_set_ah_fp_behaviours(&env->vfp.ah_fp_status_f16);
#ifndef CONFIG_USER_ONLY
if (kvm_enabled()) {
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index 2eb75bd7ecc..50a8a659577 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -64,7 +64,7 @@ void arm_set_default_fp_behaviours(float_status *s)
* set Invalid for a QNaN
* * default NaN has sign bit set, msb frac bit set
*/
-static void arm_set_ah_fp_behaviours(float_status *s)
+void arm_set_ah_fp_behaviours(float_status *s)
{
set_float_detect_tininess(float_tininess_after_rounding, s);
set_float_detect_ftz(detect_ftz_after_rounding, s);
@@ -128,6 +128,11 @@ static uint32_t vfp_get_fpsr_from_host(CPUARMState *env)
a64_flags |= get_float_exception_flags(&env->vfp.fp_status_a64);
a64_flags |= (get_float_exception_flags(&env->vfp.fp_status_f16_a64)
& ~(float_flag_input_denormal_flushed |
float_flag_input_denormal_used));
+ /*
+ * We do not merge in flags from ah_fp_status or ah_fp_status_f16, because
+ * they are used for insns that must not set the cumulative exception bits.
+ */
+
/*
* Flushing an input denormal only because FPCR.FIZ == 1 does
* not set FPSR.IDC. So squash it unless (FPCR.AH == 0 && FPCR.FZ == 1).
@@ -154,6 +159,8 @@ static void vfp_clear_float_status_exc_flags(CPUARMState
*env)
set_float_exception_flags(0, &env->vfp.fp_status_f16_a64);
set_float_exception_flags(0, &env->vfp.standard_fp_status);
set_float_exception_flags(0, &env->vfp.standard_fp_status_f16);
+ set_float_exception_flags(0, &env->vfp.ah_fp_status);
+ set_float_exception_flags(0, &env->vfp.ah_fp_status_f16);
}
static void vfp_sync_and_clear_float_status_exc_flags(CPUARMState *env)
@@ -199,9 +206,11 @@ static void vfp_set_fpcr_to_host(CPUARMState *env,
uint32_t val, uint32_t mask)
set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a32);
set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a64);
set_flush_to_zero(ftz_enabled, &env->vfp.standard_fp_status_f16);
+ set_flush_to_zero(ftz_enabled, &env->vfp.ah_fp_status_f16);
set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a32);
set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a64);
set_flush_inputs_to_zero(ftz_enabled,
&env->vfp.standard_fp_status_f16);
+ set_flush_inputs_to_zero(ftz_enabled, &env->vfp.ah_fp_status_f16);
}
if (changed & FPCR_FZ) {
bool ftz_enabled = val & FPCR_FZ;
@@ -225,6 +234,8 @@ static void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t
val, uint32_t mask)
set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_a64);
set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_f16_a32);
set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_f16_a64);
+ set_default_nan_mode(dnan_enabled, &env->vfp.ah_fp_status);
+ set_default_nan_mode(dnan_enabled, &env->vfp.ah_fp_status_f16);
}
if (changed & FPCR_AH) {
bool ah_enabled = val & FPCR_AH;
--
2.34.1
- [PATCH 36/76] target/arm: Add FPCR.NEP to TBFLAGS, (continued)
- [PATCH 36/76] target/arm: Add FPCR.NEP to TBFLAGS, Peter Maydell, 2025/01/24
- [PATCH 45/76] target/arm: Implement FPCR.AH semantics for scalar FMIN/FMAX, Peter Maydell, 2025/01/24
- [PATCH 27/76] target/arm: Define FPCR AH, FIZ, NEP bits, Peter Maydell, 2025/01/24
- [PATCH 29/76] target/arm: Adjust FP behaviour for FPCR.AH = 1, Peter Maydell, 2025/01/24
- [PATCH 31/76] target/arm: Add FPCR.AH to tbflags, Peter Maydell, 2025/01/24
- [PATCH 32/76] target/arm: Set up float_status to use for FPCR.AH=1 behaviour,
Peter Maydell <=
- [PATCH 42/76] target/arm: Handle FPCR.NEP for scalar FABS and FNEG, Peter Maydell, 2025/01/24
- [PATCH 08/76] target/arm: Use fp_status_a32 in vjvct helper, Peter Maydell, 2025/01/24
- [PATCH 50/76] target/arm: Implement FPCR.AH semantics for SVE FMIN/FMAX immediate, Peter Maydell, 2025/01/24
- [PATCH 55/76] target/arm: Handle FPCR.AH in SVE FNEG, Peter Maydell, 2025/01/24
- [PATCH 11/76] target/arm: Use FPST_FPCR_A64 in A64 decoder, Peter Maydell, 2025/01/24