[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH 5/5] tcg-arm: Better pipeline for softmmu tlb access
From: |
Richard Henderson |
Subject: |
[Qemu-devel] [PATCH 5/5] tcg-arm: Better pipeline for softmmu tlb access |
Date: |
Wed, 28 Aug 2013 15:33:33 -0700 |
Moves the load for the tlb addend earlier, to better load latency.
Avoids the writeback from the comparator load, since we know how
to adjust the offset between the two loads.
: e2862c03 add r2, r6, #768 ; 0x300
: e20c00ff and r0, ip, #255 ; 0xff
: e0822280 add r2, r2, r0, lsl #5
-: e1e209d8 ldrd r0, [r2, #152]!
+: e1c209d8 ldrd r0, [r2, #152]
: e31b0007 tst fp, #7 ; 0x7
+: e59220a8 ldr r2, [r2, #168]
: 0150068c cmpeq r0, ip, lsl #13
: 01510007 cmpeq r1, r7
-: e5921010 ldr r1, [r2, #16]
-: 018b40f1 strdeq r4, [fp, r1]
+: 018b40f2 strdeq r4, [fp, r2]
: 1b0000cb blne 0x75e7e6e4
Signed-off-by: Richard Henderson <address@hidden>
---
tcg/arm/tcg-target.c | 97 +++++++++++++++++++++++-----------------------------
1 file changed, 42 insertions(+), 55 deletions(-)
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index f1e547f..6d03d6b 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1179,13 +1179,18 @@ QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
> 0xffff);
-/* Load and compare a TLB entry, leaving the flags set. Leaves R2 pointing
- to the tlb entry. Clobbers R1 and TMP. */
+/* Load and compare a TLB entry, leaving the flags set. Leaves R2
+ containing the tlb addend. Clobbers R0, R1 and TMP. */
static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
- int s_bits, int tlb_offset)
+ int s_bits, int mem_index, bool is_load)
{
TCGReg base = TCG_AREG0;
+ int tlb_offset =
+ (is_load
+ ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
+ : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
+ int add_offset = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
/* Should generate something like the following:
* pre-v7:
@@ -1193,18 +1198,10 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg
addrlo, TCGReg addrhi,
* add r2, env, #off & 0xff00
* and r0, tmp, #(CPU_TLB_SIZE - 1) (2)
* add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS (3)
- * ldr r0, [r2, #off & 0xff]! (4)
+ * ldr r0, [r2, #off & 0xff] (4)
* tst addr_reg, #s_mask
- * cmpeq r0, tmp, lsl #TARGET_PAGE_BITS (5)
- *
- * v7 (not implemented yet):
- * ubfx r2, addr_reg, #TARGET_PAGE_BITS, #CPU_TLB_BITS (1)
- * movw tmp, #~TARGET_PAGE_MASK & ~s_mask
- * movw r0, #off
- * add r2, env, r2, lsl #CPU_TLB_ENTRY_BITS (2)
- * bic tmp, addr_reg, tmp
- * ldr r0, [r2, r0]! (3)
- * cmp r0, tmp (4)
+ * ldr r2, [r2, #addoff] (5)
+ * cmpeq r0, tmp, lsl #TARGET_PAGE_BITS
*/
tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
@@ -1213,7 +1210,6 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg
addrlo, TCGReg addrhi,
if (tlb_offset > 0xff) {
tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
(24 << 7) | (tlb_offset >> 8));
- tlb_offset &= 0xff;
base = TCG_REG_R2;
}
@@ -1226,14 +1222,12 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg
addrlo, TCGReg addrhi,
but due to how the pointer needs setting up, ldm isn't useful.
Base arm5 doesn't have ldrd, but armv5te does. */
if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
- tcg_out_memop_8(s, COND_AL, INSN_LDRD_IMM, TCG_REG_R0,
- TCG_REG_R2, tlb_offset, 1, 1);
+ tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, tlb_offset & 0xff);
} else {
- tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
- TCG_REG_R2, tlb_offset, 1, 1);
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, tlb_offset & 0xff);
if (TARGET_LONG_BITS == 64) {
- tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R1,
- TCG_REG_R2, 4, 1, 0);
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
+ (tlb_offset & 0xff) + 4);
}
}
@@ -1243,6 +1237,10 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg
addrlo, TCGReg addrhi,
0, addrlo, (1 << s_bits) - 1);
}
+ /* Load the tlb addend. */
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2,
+ add_offset - (tlb_offset & 0xff00));
+
tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
@@ -1360,53 +1358,48 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg
*args, int opc)
mem_index = *args;
s_bits = opc & 3;
- tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
- offsetof(CPUArchState,
tlb_table[mem_index][0].addr_read));
-
- tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
- offsetof(CPUTLBEntry, addend)
- - offsetof(CPUTLBEntry, addr_read));
+ tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 1);
switch (opc) {
case 0:
- tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
break;
case 0 | 4:
- tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
break;
case 1:
- tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
if (bswap) {
tcg_out_bswap16(s, COND_EQ, data_reg, data_reg);
}
break;
case 1 | 4:
if (bswap) {
- tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
tcg_out_bswap16s(s, COND_EQ, data_reg, data_reg);
} else {
- tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
}
break;
case 2:
default:
- tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
if (bswap) {
tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
}
break;
case 3:
if (bswap) {
- tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R1, addr_reg);
- tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R1, 4);
+ tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R2, addr_reg);
+ tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R2, 4);
tcg_out_bswap32(s, COND_EQ, data_reg2, data_reg2);
tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
} else if (use_armv6_instructions
&& (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
- tcg_out_ldrd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ldrd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
} else {
- tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
- tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
+ tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R2, addr_reg);
+ tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R2, 4);
}
break;
}
@@ -1506,47 +1499,41 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg
*args, int opc)
mem_index = *args;
s_bits = opc & 3;
- tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
- offsetof(CPUArchState,
- tlb_table[mem_index][0].addr_write));
-
- tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
- offsetof(CPUTLBEntry, addend)
- - offsetof(CPUTLBEntry, addr_write));
+ tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 0);
switch (opc) {
case 0:
- tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
break;
case 1:
if (bswap) {
tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg);
- tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
+ tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R2);
} else {
- tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
}
break;
case 2:
default:
if (bswap) {
tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
- tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
+ tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R2);
} else {
- tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
}
break;
case 3:
if (bswap) {
tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2);
- tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, addr_reg);
+ tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R2, addr_reg);
tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
- tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, 4);
+ tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R2, 4);
} else if (use_armv6_instructions
&& (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
- tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
} else {
- tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
- tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
+ tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R2, addr_reg);
+ tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R2, 4);
}
break;
}
--
1.8.1.4