From: Richard Henderson
Subject: [Qemu-devel] [PATCH for-next 14/15] tcg-ppc64: Streamline tcg_out_tlb_read
Date: Mon, 5 Aug 2013 08:28:49 -1000
Less conditional compilation. Merge an add insn with the indexed
memory load insn. Load the tlb addend earlier. Avoid the address
update memory form.
Signed-off-by: Richard Henderson <address@hidden>
---
tcg/ppc64/tcg-target.c | 202 +++++++++++++++++++++++--------------------------
1 file changed, 95 insertions(+), 107 deletions(-)
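Not part of the applied patch: below is a minimal standalone C sketch of the
lookup math that the rlwinm/rldicl sequences in tcg_out_tlb_read implement,
as a reading aid for the diff. The three *_BITS constants mirror QEMU's
names, but the values here are assumed purely for illustration.

/* Sketch of the TLB entry-offset and comparator computation emitted by
   tcg_out_tlb_read.  Constant values are illustrative assumptions. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define TARGET_PAGE_BITS   12  /* assumed: 4K pages */
#define CPU_TLB_BITS        8  /* assumed: 256-entry TLB */
#define CPU_TLB_ENTRY_BITS  5  /* assumed: log2(sizeof(CPUTLBEntry)) */

int main(void)
{
    uint64_t addr = 0x123456789abcULL;  /* example guest address */
    int s_bits = 2;                     /* example: 4-byte access */

    /* Byte offset of the TLB entry: page index, masked to the table size,
       scaled by the entry size.  The rldicl+sldi pair (64-bit) or the
       single rlwinm (32-bit) computes exactly this. */
    uint64_t entry_off = ((addr >> TARGET_PAGE_BITS)
                          & ((1u << CPU_TLB_BITS) - 1)) << CPU_TLB_ENTRY_BITS;

    /* Value compared against the TLB comparator: the in-page bits are
       cleared except for the low s_bits, so an unaligned access never
       matches and falls through to the slow-path helper. */
    uint64_t cmp = addr
        & ~(uint64_t)((1u << TARGET_PAGE_BITS) - (1u << s_bits));

    printf("entry offset %#" PRIx64 ", compare value %#" PRIx64 "\n",
           entry_off, cmp);
    return 0;
}

The CMP against the comparator loaded via LD_ADDR lands in CR7, which the
fast-path branch then tests; any surviving alignment bit forces a mismatch
and a trip through the helper.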
diff --git a/tcg/ppc64/tcg-target.c b/tcg/ppc64/tcg-target.c
index 90d033c..4b23597 100644
--- a/tcg/ppc64/tcg-target.c
+++ b/tcg/ppc64/tcg-target.c
@@ -31,13 +31,11 @@
static uint8_t *tb_ret_addr;
-#define FAST_PATH
-
#if TARGET_LONG_BITS == 32
-#define LD_ADDR LWZU
+#define LD_ADDR LWZ
#define CMP_L 0
#else
-#define LD_ADDR LDU
+#define LD_ADDR LD
#define CMP_L (1<<21)
#endif
@@ -854,39 +852,64 @@ static const void * const qemu_st_helpers[4] = {
helper_stq_mmu,
};
-static void tcg_out_tlb_read(TCGContext *s, TCGReg r0, TCGReg r1, TCGReg r2,
- TCGReg addr_reg, int s_bits, int offset)
+/* Perform the TLB load and compare. Places the result of the comparison
+ in CR7, loads the addend of the TLB into R3, and returns the register
+ containing the guest address (zero-extended into R4). Clobbers R0 and R2. */
+
+static TCGReg tcg_out_tlb_read(TCGContext *s, int s_bits, TCGReg addr_reg,
+ int mem_index, bool is_read)
{
-#if TARGET_LONG_BITS == 32
- tcg_out_ext32u(s, addr_reg, addr_reg);
-
- tcg_out_rlw(s, RLWINM, r0, addr_reg,
- 32 - (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS),
- 32 - (CPU_TLB_BITS + CPU_TLB_ENTRY_BITS),
- 31 - CPU_TLB_ENTRY_BITS);
- tcg_out32(s, ADD | TAB(r0, r0, TCG_AREG0));
- tcg_out32(s, LWZU | TAI(r1, r0, offset));
- tcg_out_rlw(s, RLWINM, r2, addr_reg, 0,
- (32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
-#else
- tcg_out_rld (s, RLDICL, r0, addr_reg,
- 64 - TARGET_PAGE_BITS,
- 64 - CPU_TLB_BITS);
- tcg_out_shli64(s, r0, r0, CPU_TLB_ENTRY_BITS);
+ size_t offset
+ = (is_read
+ ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
+ : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
+
+ /* Extract the page index, shifted into place for tlb index. */
+ if (TARGET_LONG_BITS == 32) {
+ /* Zero-extend the address into a place helpful for further use. */
+ tcg_out_ext32u(s, TCG_REG_R4, addr_reg);
+ addr_reg = TCG_REG_R4;
+
+ tcg_out_rlw(s, RLWINM, TCG_REG_R3, addr_reg,
+ 32 - (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS),
+ 32 - (CPU_TLB_BITS + CPU_TLB_ENTRY_BITS),
+ 31 - CPU_TLB_ENTRY_BITS);
+ } else {
+ tcg_out_rld (s, RLDICL, TCG_REG_R3, addr_reg,
+ 64 - TARGET_PAGE_BITS,
+ 64 - CPU_TLB_BITS);
+ tcg_out_shli64(s, TCG_REG_R3, TCG_REG_R3, CPU_TLB_ENTRY_BITS);
+ }
- tcg_out32(s, ADD | TAB(r0, r0, TCG_AREG0));
- tcg_out32(s, LD_ADDR | TAI(r1, r0, offset));
+ /* Load the tlb comparator. */
+ tcg_out32(s, ADD | TAB(TCG_REG_R3, TCG_REG_R3, TCG_AREG0));
+ tcg_out32(s, LD_ADDR | TAI(TCG_REG_R2, TCG_REG_R3, offset));
- if (!s_bits) {
- tcg_out_rld (s, RLDICR, r2, addr_reg, 0, 63 - TARGET_PAGE_BITS);
- }
- else {
- tcg_out_rld (s, RLDICL, r2, addr_reg,
- 64 - TARGET_PAGE_BITS,
- TARGET_PAGE_BITS - s_bits);
- tcg_out_rld (s, RLDICL, r2, r2, TARGET_PAGE_BITS, 0);
+ /* Load the TLB addend for use on the fast path. Do this asap
+ to minimize any load use delay. */
+ offset = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
+ tcg_out32(s, LD | TAI(TCG_REG_R3, TCG_REG_R3, offset));
+
+ /* Clear the non-page, non-alignment bits from the address. */
+ if (TARGET_LONG_BITS == 32) {
+ tcg_out_rlw(s, RLWINM, TCG_REG_R0, addr_reg, 0,
+ (32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
+ } else {
+ if (!s_bits) {
+ tcg_out_rld (s, RLDICR, TCG_REG_R0, addr_reg,
+ 0, 63 - TARGET_PAGE_BITS);
+ } else {
+ tcg_out_rld (s, RLDICL, TCG_REG_R0, addr_reg,
+ 64 - TARGET_PAGE_BITS,
+ TARGET_PAGE_BITS - s_bits);
+ tcg_out_rld (s, RLDICL, TCG_REG_R0, TCG_REG_R0,
+ TARGET_PAGE_BITS, 0);
+ }
}
-#endif
+
+ tcg_out32(s, CMP | BF(7) | RA(TCG_REG_R0) | RB(TCG_REG_R2) | CMP_L);
+
+ return addr_reg;
}
#endif
@@ -918,7 +941,7 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc)
PowerOpcode insn;
int s_bits;
#ifdef CONFIG_SOFTMMU
- TCGReg r2, ir;
+ TCGReg ir;
int mem_index;
void *label1_ptr, *label2_ptr;
#endif
@@ -930,26 +953,16 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc)
#ifdef CONFIG_SOFTMMU
mem_index = *args;
- r0 = 3;
- r1 = 4;
- r2 = 0;
- rbase = 0;
-
- tcg_out_tlb_read (s, r0, r1, r2, addr_reg, s_bits,
- offsetof (CPUArchState, tlb_table[mem_index][0].addr_read));
-
- tcg_out32 (s, CMP | BF (7) | RA (r2) | RB (r1) | CMP_L);
+ r0 = tcg_out_tlb_read(s, s_bits, addr_reg, mem_index, true);
label1_ptr = s->code_ptr;
-#ifdef FAST_PATH
- tcg_out32 (s, BC | BI (7, CR_EQ) | BO_COND_TRUE);
-#endif
+ tcg_out32(s, BC | BI (7, CR_EQ) | BO_COND_TRUE);
/* slow path */
- ir = 3;
- tcg_out_mov (s, TCG_TYPE_I64, ir++, TCG_AREG0);
- tcg_out_mov (s, TCG_TYPE_I64, ir++, addr_reg);
- tcg_out_movi (s, TCG_TYPE_I64, ir++, mem_index);
+ ir = TCG_REG_R3;
+ tcg_out_mov(s, TCG_TYPE_I64, ir++, TCG_AREG0);
+ tcg_out_mov(s, TCG_TYPE_I64, ir++, addr_reg);
+ tcg_out_movi(s, TCG_TYPE_I64, ir++, mem_index);
tcg_out_call(s, (tcg_target_long)qemu_ld_helpers[s_bits], 1, LK);
@@ -959,29 +972,23 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc)
} else if (data_reg != 3) {
tcg_out_mov(s, TCG_TYPE_I64, data_reg, 3);
}
+
label2_ptr = s->code_ptr;
- tcg_out32 (s, B);
+ tcg_out32(s, B);
/* label1: fast path */
-#ifdef FAST_PATH
- reloc_pc14 (label1_ptr, (tcg_target_long) s->code_ptr);
-#endif
-
- /* r0 now contains &env->tlb_table[mem_index][index].addr_read */
- tcg_out32(s, LD | TAI(r0, r0,
- offsetof(CPUTLBEntry, addend)
- - offsetof(CPUTLBEntry, addr_read)));
- /* r0 = env->tlb_table[mem_index][index].addend */
- tcg_out32(s, ADD | TAB(r0, r0, addr_reg));
- /* r0 = env->tlb_table[mem_index][index].addend + addr */
+ reloc_pc14(label1_ptr, (tcg_target_long)s->code_ptr);
+ rbase = TCG_REG_R3;
+ r1 = TCG_REG_R0;
#else /* !CONFIG_SOFTMMU */
-#if TARGET_LONG_BITS == 32
- tcg_out_ext32u(s, addr_reg, addr_reg);
-#endif
- r0 = addr_reg;
- r1 = 3;
rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0;
+ r0 = addr_reg;
+ r1 = TCG_REG_R0;
+ if (TARGET_LONG_BITS == 32) {
+ r0 = TCG_REG_R2;
+ tcg_out_ext32u(s, r0, addr_reg);
+ }
#endif
insn = qemu_ldx_opc[opc];
@@ -1000,7 +1007,7 @@ static void tcg_out_qemu_ld (TCGContext *s, const TCGArg *args, int opc)
}
#ifdef CONFIG_SOFTMMU
- reloc_pc24 (label2_ptr, (tcg_target_long) s->code_ptr);
+ reloc_pc24(label2_ptr, (tcg_target_long)s->code_ptr);
#endif
}
@@ -1009,7 +1016,7 @@ static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc)
TCGReg addr_reg, r0, r1, rbase, data_reg;
PowerOpcode insn;
#ifdef CONFIG_SOFTMMU
- TCGReg r2, ir;
+ TCGReg ir;
int mem_index;
void *label1_ptr, *label2_ptr;
#endif
@@ -1020,63 +1027,44 @@ static void tcg_out_qemu_st (TCGContext *s, const TCGArg *args, int opc)
#ifdef CONFIG_SOFTMMU
mem_index = *args;
- r0 = 3;
- r1 = 4;
- r2 = 0;
- rbase = 0;
-
- tcg_out_tlb_read (s, r0, r1, r2, addr_reg, opc,
- offsetof (CPUArchState, tlb_table[mem_index][0].addr_write));
-
- tcg_out32 (s, CMP | BF (7) | RA (r2) | RB (r1) | CMP_L);
+ r0 = tcg_out_tlb_read(s, opc, addr_reg, mem_index, false);
label1_ptr = s->code_ptr;
-#ifdef FAST_PATH
- tcg_out32 (s, BC | BI (7, CR_EQ) | BO_COND_TRUE);
-#endif
+ tcg_out32(s, BC | BI (7, CR_EQ) | BO_COND_TRUE);
/* slow path */
- ir = 3;
- tcg_out_mov (s, TCG_TYPE_I64, ir++, TCG_AREG0);
- tcg_out_mov (s, TCG_TYPE_I64, ir++, addr_reg);
- tcg_out_rld (s, RLDICL, ir++, data_reg, 0, 64 - (1 << (3 + opc)));
- tcg_out_movi (s, TCG_TYPE_I64, ir++, mem_index);
+ ir = TCG_REG_R3;
+ tcg_out_mov(s, TCG_TYPE_I64, ir++, TCG_AREG0);
+ tcg_out_mov(s, TCG_TYPE_I64, ir++, addr_reg);
+ tcg_out_rld(s, RLDICL, ir++, data_reg, 0, 64 - (1 << (3 + opc)));
+ tcg_out_movi(s, TCG_TYPE_I64, ir++, mem_index);
tcg_out_call(s, (tcg_target_long)qemu_st_helpers[opc], 1, LK);
label2_ptr = s->code_ptr;
- tcg_out32 (s, B);
+ tcg_out32(s, B);
/* label1: fast path */
-#ifdef FAST_PATH
- reloc_pc14 (label1_ptr, (tcg_target_long) s->code_ptr);
-#endif
-
- tcg_out32 (s, (LD
- | RT (r0)
- | RA (r0)
- | (offsetof (CPUTLBEntry, addend)
- - offsetof (CPUTLBEntry, addr_write))
- ));
- /* r0 = env->tlb_table[mem_index][index].addend */
- tcg_out32(s, ADD | TAB(r0, r0, addr_reg));
- /* r0 = env->tlb_table[mem_index][index].addend + addr */
+ reloc_pc14(label1_ptr, (tcg_target_long) s->code_ptr);
+ rbase = TCG_REG_R3;
+ r1 = TCG_REG_R2;
#else /* !CONFIG_SOFTMMU */
-#if TARGET_LONG_BITS == 32
- tcg_out_ext32u(s, addr_reg, addr_reg);
-#endif
- r1 = 3;
- r0 = addr_reg;
rbase = GUEST_BASE ? TCG_GUEST_BASE_REG : 0;
+ r0 = addr_reg;
+ r1 = TCG_REG_R3;
+ if (TARGET_LONG_BITS == 32) {
+ r0 = TCG_REG_R2;
+ tcg_out_ext32u(s, r0, addr_reg);
+ }
#endif
insn = qemu_stx_opc[opc];
if (!HAVE_ISA_2_06 && insn == STDBRX) {
tcg_out32(s, STWBRX | SAB(data_reg, rbase, r0));
tcg_out32(s, ADDI | TAI(r1, r0, 4));
- tcg_out_shri64(s, 0, data_reg, 32);
- tcg_out32(s, STWBRX | SAB(0, rbase, r1));
+ tcg_out_shri64(s, TCG_REG_R0, data_reg, 32);
+ tcg_out32(s, STWBRX | SAB(TCG_REG_R0, rbase, r1));
} else {
tcg_out32(s, insn | SAB(data_reg, rbase, r0));
}
--
1.8.3.1