With x86_64 as host, we do not have any temporaries with which to
resolve cycles, but we do have xchg. As a side bonus, the set of
graphs that can be made with 3 nodes and all nodes conflicting is
small: there are only two. We can solve the cycle with a single temp.
This is required for x86_64 to handle stores of i128, which involve
3 registers: 1 address register and 2 data registers.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
static void tcg_out_helper_load_regs(TCGContext *s,
unsigned nmov, TCGMovExtend *mov,
- unsigned ntmp, const int *tmp)
+ const TCGLdstHelperParam *parm)
{
+ TCGReg dst3;
+
switch (nmov) {
- default:
+ case 4:
/* The backend must have provided enough temps for the worst case. */
- tcg_debug_assert(ntmp + 1 >= nmov);
+ tcg_debug_assert(parm->ntmp >= 2);
- for (unsigned i = nmov - 1; i >= 2; --i) {
- TCGReg dst = mov[i].dst;
-
- for (unsigned j = 0; j < i; ++j) {
- if (dst == mov[j].src) {
- /*
- * Conflict.
- * Copy the source to a temporary, recurse for the
- * remaining moves, perform the extension from our
- * scratch on the way out.
- */
- TCGReg scratch = tmp[--ntmp];
- tcg_out_mov(s, mov[i].src_type, scratch, mov[i].src);
- mov[i].src = scratch;
-
- tcg_out_helper_load_regs(s, i, mov, ntmp, tmp);
- tcg_out_movext1(s, &mov[i]);
- return;
- }
+ dst3 = mov[3].dst;
+ for (unsigned j = 0; j < 3; ++j) {
+ if (dst3 == mov[j].src) {
+ /*
+ * Conflict. Copy the source to a temporary, perform the
+ * remaining moves, then the extension from our scratch
+ * on the way out.
+ */
+ TCGReg scratch = parm->tmp[1];
+ tcg_out_movext3(s, mov, mov + 1, mov + 2, parm->tmp[0]);
+ tcg_out_movext1_new_src(s, &mov[3], scratch);