dotgnu-pnet-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[dotgnu-pnet-commits] libjit ChangeLog jit/jit-apply-x86-64.h jit/jit...


From: Klaus Treichel
Subject: [dotgnu-pnet-commits] libjit ChangeLog jit/jit-apply-x86-64.h jit/jit...
Date: Sun, 02 Mar 2008 17:07:07 +0000

CVSROOT:        /cvsroot/dotgnu-pnet
Module name:    libjit
Changes by:     Klaus Treichel <ktreichel>      08/03/02 17:07:06

Modified files:
        .              : ChangeLog 
        jit            : jit-apply-x86-64.h jit-apply-x86-64.c 
                         jit-gen-x86-64.h jit-insn.c jit-rules.h 
                         jit-value.c Makefile.am 
Added files:
        jit            : jit-rules-x86-64.c jit-rules-x86-64.h 
                         jit-rules-x86-64.ins 

Log message:
        Fix two 64-bit arch issues and add first (not publicly usable) native
support for
        X86_64.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/libjit/ChangeLog?cvsroot=dotgnu-pnet&r1=1.348&r2=1.349
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-apply-x86-64.h?cvsroot=dotgnu-pnet&r1=1.6&r2=1.7
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-apply-x86-64.c?cvsroot=dotgnu-pnet&r1=1.1&r2=1.2
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-gen-x86-64.h?cvsroot=dotgnu-pnet&r1=1.1&r2=1.2
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-insn.c?cvsroot=dotgnu-pnet&r1=1.58&r2=1.59
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-rules.h?cvsroot=dotgnu-pnet&r1=1.16&r2=1.17
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-value.c?cvsroot=dotgnu-pnet&r1=1.13&r2=1.14
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/Makefile.am?cvsroot=dotgnu-pnet&r1=1.23&r2=1.24
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-rules-x86-64.c?cvsroot=dotgnu-pnet&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-rules-x86-64.h?cvsroot=dotgnu-pnet&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-rules-x86-64.ins?cvsroot=dotgnu-pnet&rev=1.1

Patches:
Index: ChangeLog
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/ChangeLog,v
retrieving revision 1.348
retrieving revision 1.349
diff -u -b -r1.348 -r1.349
--- ChangeLog   29 Feb 2008 11:10:41 -0000      1.348
+++ ChangeLog   2 Mar 2008 17:07:05 -0000       1.349
@@ -1,3 +1,31 @@
+2008-03-02  Klaus Treichel  <address@hidden>
+
+       * jit/jit-apply-x86-64.h: define the sizes for indirector and
+       redirector.
+
+       * jit/jit-apply-x86-64.c: do either a memory indirect, RIP relative
+       or register relative jump in the redirector whatever is appropriate
+       for the address location in _jit_create_indirector.
+
+       * jit/jit-gen-x86-64.h: add lots of additional code generation
+       macros and fix some bugs.
+
+       * jit/jit-insn.c: don't mark the current block dead after throwing
+       an exception in jit_insn_call_intrinsic because this is handled in
+       jit_insn_call_native if the flag JIT_CALL_NORETURN is specified.
+
+       * jit/Makefile.am: Add the new files jit-rules-x86-64.c,
+       jit-rules-x86-64.h and jit-rules-x86-64.ins to the sources.
+
+       * jit/jit-rules.h: add the native backend for X86_64.
+
+       * jit/jit-rules-x86-64.c, jit/jit-rules-x86-64.h,
+       jit/jit-rules-x86-64.ins: add the first native  code generation for
+       X86_64.
+
+       * jit/jit-value.c: create a nint constant for long/ulong types in
+       jit_value_create_constant on 64bit archs.
+
 2008-02-29  Aleksey Demakov  <address@hidden>
 
        * include/jit/jit-plus.h, jitplus/jit-plus-jump-table.cpp:

Index: jit/jit-apply-x86-64.h
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/jit/jit-apply-x86-64.h,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -b -r1.6 -r1.7
--- jit/jit-apply-x86-64.h      29 Jan 2008 20:16:32 -0000      1.6
+++ jit/jit-apply-x86-64.h      2 Mar 2008 17:07:06 -0000       1.7
@@ -174,4 +174,17 @@
 #define        jit_closure_size                0x90
 #define        jit_closure_align               0x20
 
+/*
+ * The number of bytes that are needed for a redirector stub.
+ * This includes any extra bytes that are needed for alignment.
+ */
+#define        jit_redirector_size             0x100
+
+/*
+ * The number of bytes that are needed for a indirector stub.
+ * This includes any extra bytes that are needed for alignment.
+ */
+#define        jit_indirector_size             0x10
+
+
 #endif /* _JIT_APPLY_X86_64_H */

Index: jit/jit-apply-x86-64.c
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/jit/jit-apply-x86-64.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- jit/jit-apply-x86-64.c      29 Jan 2008 20:16:32 -0000      1.1
+++ jit/jit-apply-x86-64.c      2 Mar 2008 17:07:06 -0000       1.2
@@ -171,21 +171,30 @@
 
 void *_jit_create_indirector(unsigned char *buf, void **entry)
 {
-       jit_nint offset;
        void *start = (void *)buf;
 
        /* Jump to the entry point. */
-       offset = (jit_nint)entry - ((jit_nint)buf + 5);
-       if((offset < jit_min_int) || (offset > jit_max_int))
+       if(((jit_nint)entry >= jit_min_int) && ((jit_nint)entry <= jit_max_int))
+       {
+               /* We are in the 32bit range so we can use the entry directly. 
*/
+               x86_64_jmp_mem(buf, (jit_nint)entry);
+       }
+       else
+       {
+               jit_nint offset = (jit_nint)entry - ((jit_nint)buf + 7);
+
+               if((offset >= jit_min_int) && (offset <= jit_max_int))
+               {
+                       /* We are in the 32bit range so we can use RIP relative 
addressing. */
+                       x86_64_jmp_membase(buf, X86_64_RIP, offset);
+               }
+               else
        {
                /* offset is outside the 32 bit offset range */
                /* so we have to do an indirect jump via register. */
                x86_64_mov_reg_imm_size(buf, X86_64_R11, (jit_nint)entry, 8);
-               x86_64_jmp_reg(buf, X86_64_R11);
+                       x86_64_jmp_regp(buf, X86_64_R11);
        }
-       else
-       {
-               x86_64_jmp_mem(buf, offset);
        }
 
        return start;

Index: jit/jit-gen-x86-64.h
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/jit/jit-gen-x86-64.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- jit/jit-gen-x86-64.h        29 Jan 2008 20:16:32 -0000      1.1
+++ jit/jit-gen-x86-64.h        2 Mar 2008 17:07:06 -0000       1.2
@@ -95,6 +95,22 @@
 } X86_64_REX_Bits;
 
 /*
+ * Third part of the opcodes for xmm instructions which are encoded
+ * Opcode1: 0xF3 (single precision) or 0xF2 (double precision)
+ *          This is handled as a prefix.
+ * Opcode2: 0x0F
+ */
+typedef enum
+{
+       XMM1_MOV                = 0x10,
+       XMM1_MOV_REV    = 0x11,
+       XMM1_ADD                = 0x58,
+       XMM1_MUL                = 0x59,
+       XMM1_SUB                = 0x5C,
+       XMM1_DIV                = 0x5E
+} X86_64_XMM1_OP;
+
+/*
  * Helper union for emmitting 64 bit immediate values.
  */
 typedef union
@@ -175,18 +191,29 @@
                } \
        } while(0)
 
+/*
+ * Emit the Rex prefix.
+ * The natural size is a power of 2 (1, 2, 4 or 8).
+ * For accessing the low byte registers DIL, SIL, BPL and SPL we have to
+ * generate a Rex prefix with the value 0x40 too.
+ * To enable this OR the natural size with 1.
+ */
 #define x86_64_rex(rex_bits)   (0x40 | (rex_bits))
 #define x86_64_rex_emit(inst, width, modrm_reg, index_reg, rm_base_opcode_reg) 
\
        do { \
                unsigned char __rex_bits = \
-                       (((width) > 4) ? X86_64_REX_W : 0) | \
-                       (((modrm_reg) > 7) ? X86_64_REX_R : 0) | \
-                       (((index_reg) > 7) ? X86_64_REX_X : 0) | \
-                       (((rm_base_opcode_reg) > 7) ? X86_64_REX_B : 0); \
+                       (((width) & 8) ? X86_64_REX_W : 0) | \
+                       (((modrm_reg) & 8) ? X86_64_REX_R : 0) | \
+                       (((index_reg) & 8) ? X86_64_REX_X : 0) | \
+                       (((rm_base_opcode_reg) & 8) ? X86_64_REX_B : 0); \
                if((__rex_bits != 0)) \
                { \
                         *(inst)++ = x86_64_rex(__rex_bits); \
                } \
+               else if(((width) & 1) && ((modrm_reg & 4) || 
(rm_base_opcode_reg & 4))) \
+               { \
+                        *(inst)++ = x86_64_rex(0); \
+               } \
        } while(0)
 
 /*
@@ -854,6 +881,9 @@
  * Instructions with one opcode (plus optional r/m)
  */
 
+/*
+ * Unary opcodes
+ */
 #define x86_64_alu1_reg(inst, opc1, r, reg) \
        do { \
                x86_64_rex_emit((inst), 0, 0, 0, (reg)); \
@@ -943,7 +973,7 @@
                x86_64_memindex_emit((inst), (r), (basereg), (disp), 
(indexreg), (shift)); \
        } while(0)
 
-#define x86_64_alu2_reg_reg_size(inst, opc1, opc2, dreg, sreg, size) \
+#define x86_64_alu1_reg_reg_size(inst, opc1, dreg, sreg, size) \
        do { \
                if((size) == 2) \
                { \
@@ -951,11 +981,10 @@
                } \
                x86_64_rex_emit((inst), (size), (dreg), 0, (sreg)); \
                *(inst)++ = (unsigned char)(opc1); \
-               *(inst)++ = (unsigned char)(opc2); \
                x86_64_reg_emit((inst), (dreg), (sreg));        \
        } while(0)
 
-#define x86_64_alu2_reg_regp_size(inst, opc1, opc2, dreg, sregp, size) \
+#define x86_64_alu1_reg_regp_size(inst, opc1, dreg, sregp, size) \
        do { \
                if((size) == 2) \
                { \
@@ -963,11 +992,10 @@
                } \
                x86_64_rex_emit((inst), (size), (dreg), 0, (sregp)); \
                *(inst)++ = (unsigned char)(opc1); \
-               *(inst)++ = (unsigned char)(opc2); \
                x86_64_regp_emit((inst), (dreg), (sregp));      \
        } while(0)
 
-#define x86_64_alu2_reg_mem_size(inst, opc1, opc2, dreg, mem, size) \
+#define x86_64_alu1_reg_mem_size(inst, opc1, dreg, mem, size) \
        do { \
                if((size) == 2) \
                { \
@@ -975,11 +1003,10 @@
                } \
                x86_64_rex_emit((inst), (size), (dreg), 0, 0); \
                *(inst)++ = (unsigned char)(opc1); \
-               *(inst)++ = (unsigned char)(opc2); \
                x86_64_mem_emit((inst), (dreg), (mem)); \
        } while(0)
 
-#define x86_64_alu2_reg_membase_size(inst, opc1, opc2, dreg, basereg, disp, 
size) \
+#define x86_64_alu1_reg_membase_size(inst, opc1, dreg, basereg, disp, size) \
        do { \
                if((size) == 2) \
                { \
@@ -987,11 +1014,10 @@
                } \
                x86_64_rex_emit((inst), (size), (dreg), 0, (basereg)); \
                *(inst)++ = (unsigned char)(opc1); \
-               *(inst)++ = (unsigned char)(opc2); \
                x86_64_membase_emit((inst), (dreg), (basereg), (disp)); \
        } while(0)
        
-#define x86_64_alu2_reg_memindex_size(inst, opc1, opc2, dreg, basereg, disp, 
indexreg, shift, size) \
+#define x86_64_alu1_reg_memindex_size(inst, opc1, dreg, basereg, disp, 
indexreg, shift, size) \
        do { \
                if((size) == 2) \
                { \
@@ -999,82 +1025,67 @@
                } \
                x86_64_rex_emit((inst), (size), (dreg), (indexreg), (basereg)); 
\
                *(inst)++ = (unsigned char)(opc1); \
-               *(inst)++ = (unsigned char)(opc2); \
                x86_64_memindex_emit((inst), (dreg), (basereg), (disp), 
(indexreg), (shift)); \
        } while(0)
 
-/*
- * xmm instructions with two opcodes
- */
-#define x86_64_xmm2_reg_reg(inst, opc1, opc2, r, reg) \
-       do { \
-               x86_64_rex_emit(inst, 0, (r), 0, (reg)); \
-               *(inst)++ = (unsigned char)(opc1); \
-               *(inst)++ = (unsigned char)(opc2); \
-               x86_64_reg_emit(inst, (r), (reg)); \
-       } while(0)
-
-#define x86_64_xmm2_reg_regp(inst, opc1, opc2, r, regp) \
-       do { \
-               x86_64_rex_emit(inst, 0, (r), 0, (regp)); \
-               *(inst)++ = (unsigned char)(opc1); \
-               *(inst)++ = (unsigned char)(opc2); \
-               x86_64_regp_emit(inst, (r), (regp)); \
-       } while(0)
-
-#define x86_64_xmm2_reg_membase(inst, opc1, opc2, r, basereg, disp) \
-       do { \
-               x86_64_rex_emit(inst, 0, (r), 0, (basereg)); \
-               *(inst)++ = (unsigned char)(opc1); \
-               *(inst)++ = (unsigned char)(opc2); \
-               x86_64_membase_emit(inst, (r), (basereg), (disp)); \
-       } while(0)
-
-#define x86_64_xmm2_reg_memindex(inst, opc1, opc2, r, basereg, disp, indexreg, 
shift) \
+#define x86_64_alu2_reg_reg_size(inst, opc1, opc2, dreg, sreg, size) \
        do { \
-               x86_64_rex_emit(inst, 0, (r), (indexreg), (basereg)); \
+               if((size) == 2) \
+               { \
+                       *(inst)++ = (unsigned char)0x66; \
+               } \
+               x86_64_rex_emit((inst), (size), (dreg), 0, (sreg)); \
                *(inst)++ = (unsigned char)(opc1); \
                *(inst)++ = (unsigned char)(opc2); \
-               x86_64_memindex_emit((inst), (r), (basereg), (disp), 
(indexreg), (shift)); \
+               x86_64_reg_emit((inst), (dreg), (sreg));        \
        } while(0)
 
-/*
- * xmm instructions with a prefix and two opcodes
- */
-#define x86_64_p1_xmm2_reg_reg(inst, p1, opc1, opc2, r, reg) \
+#define x86_64_alu2_reg_regp_size(inst, opc1, opc2, dreg, sregp, size) \
        do { \
-               *(inst)++ = (unsigned char)(p1); \
-               x86_64_rex_emit(inst, 0, (r), 0, (reg)); \
+               if((size) == 2) \
+               { \
+                       *(inst)++ = (unsigned char)0x66; \
+               } \
+               x86_64_rex_emit((inst), (size), (dreg), 0, (sregp)); \
                *(inst)++ = (unsigned char)(opc1); \
                *(inst)++ = (unsigned char)(opc2); \
-               x86_64_reg_emit(inst, (r), (reg)); \
+               x86_64_regp_emit((inst), (dreg), (sregp));      \
        } while(0)
 
-#define x86_64_p1_xmm2_reg_regp(inst, p1, opc1, opc2, r, regp) \
+#define x86_64_alu2_reg_mem_size(inst, opc1, opc2, dreg, mem, size) \
        do { \
-               *(inst)++ = (unsigned char)(p1); \
-               x86_64_rex_emit(inst, 0, (r), 0, (regp)); \
+               if((size) == 2) \
+               { \
+                       *(inst)++ = (unsigned char)0x66; \
+               } \
+               x86_64_rex_emit((inst), (size), (dreg), 0, 0); \
                *(inst)++ = (unsigned char)(opc1); \
                *(inst)++ = (unsigned char)(opc2); \
-               x86_64_regp_emit(inst, (r), (regp)); \
+               x86_64_mem_emit((inst), (dreg), (mem)); \
        } while(0)
 
-#define x86_64_p1_xmm2_reg_membase(inst, p1, opc1, opc2, r, basereg, disp) \
+#define x86_64_alu2_reg_membase_size(inst, opc1, opc2, dreg, basereg, disp, 
size) \
        do { \
-               *(inst)++ = (unsigned char)(p1); \
-               x86_64_rex_emit(inst, 0, (r), 0, (basereg)); \
+               if((size) == 2) \
+               { \
+                       *(inst)++ = (unsigned char)0x66; \
+               } \
+               x86_64_rex_emit((inst), (size), (dreg), 0, (basereg)); \
                *(inst)++ = (unsigned char)(opc1); \
                *(inst)++ = (unsigned char)(opc2); \
-               x86_64_membase_emit(inst, (r), (basereg), (disp)); \
+               x86_64_membase_emit((inst), (dreg), (basereg), (disp)); \
        } while(0)
 
-#define x86_64_p1_xmm2_reg_memindex(inst, p1, opc1, opc2, r, basereg, disp, 
indexreg, shift) \
+#define x86_64_alu2_reg_memindex_size(inst, opc1, opc2, dreg, basereg, disp, 
indexreg, shift, size) \
        do { \
-               *(inst)++ = (unsigned char)(p1); \
-               x86_64_rex_emit(inst, 0, (r), (indexreg), (basereg)); \
+               if((size) == 2) \
+               { \
+                       *(inst)++ = (unsigned char)0x66; \
+               } \
+               x86_64_rex_emit((inst), (size), (dreg), (indexreg), (basereg)); 
\
                *(inst)++ = (unsigned char)(opc1); \
                *(inst)++ = (unsigned char)(opc2); \
-               x86_64_memindex_emit((inst), (r), (basereg), (disp), 
(indexreg), (shift)); \
+               x86_64_memindex_emit((inst), (dreg), (basereg), (disp), 
(indexreg), (shift)); \
        } while(0)
 
 /*
@@ -1873,6 +1884,12 @@
        } while(0)
 
 /*
+ * Note: x86_64_clear_reg () changes the condition code!
+ */
+#define x86_64_clear_reg(inst, reg) \
+       x86_64_xor_reg_reg_size((inst), (reg), (reg), 4)
+
+/*
  * Lea instructions
  */
 #define x86_64_lea_mem_size(inst, dreg, mem, size) \
@@ -1986,15 +2003,43 @@
                        *(inst)++ = (unsigned char)0x66; \
                } \
                x86_64_rex_emit(inst, (size), 0, 0, (dreg)); \
-               if((size) == 1) \
+               switch((size)) \
+               { \
+                       case 1: \
                { \
                        *(inst)++ = (unsigned char)0xb0 + ((dreg) & 0x7); \
+                               x86_imm_emit8(inst, (imm)); \
+                       } \
+                       break; \
+                       case 2: \
+                       { \
+                               *(inst)++ = (unsigned char)0xb8 + ((dreg) & 
0x7); \
+                               x86_imm_emit16(inst, (imm)); \
+                       } \
+                       break; \
+                       case 4: \
+                       { \
+                               *(inst)++ = (unsigned char)0xb8 + ((dreg) & 
0x7); \
+                               x86_imm_emit32(inst, (imm)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                               jit_nint __x86_64_imm = (imm); \
+                               if(__x86_64_imm >= jit_min_int && __x86_64_imm 
<= jit_max_int) \
+                               { \
+                                       *(inst)++ = (unsigned char)0xc7; \
+                                       x86_64_reg_emit((inst), 0, (dreg)); \
+                                       x86_imm_emit32(inst, (__x86_64_imm)); \
                } \
                else \
                { \
                        *(inst)++ = (unsigned char)0xb8 + ((dreg) & 0x7); \
+                                       x86_64_imm_emit64(inst, 
(__x86_64_imm)); \
+                               } \
+                       } \
+                       break; \
                } \
-               x86_64_imm_emit_max64(inst, (imm), (size)); \
        } while(0)
 
 /*
@@ -2022,6 +2067,17 @@
                } \
        } while (0)
 
+#define x86_64_mov_reg_regp_size(inst, dreg, sregp, size) \
+       do { \
+               if((size) == 2) \
+               { \
+                       *(inst)++ = (unsigned char)0x66; \
+               } \
+               x86_64_rex_emit(inst, (size), (dreg), 0, (sregp)); \
+               x86_64_opcode1_emit(inst, 0x8a, (size)); \
+               x86_64_regp_emit((inst), (dreg), (sregp)); \
+       } while(0)
+
 #define x86_64_mov_reg_membase_size(inst, dreg, basereg, disp, size) \
        do { \
                if((size) == 2) \
@@ -2062,6 +2118,18 @@
                x86_64_imm_emit_max32(inst, (imm), (size)); \
        } while(0)
 
+#define x86_64_mov_regp_imm_size(inst, dregp, imm, size) \
+       do { \
+               if((size) == 2) \
+               { \
+                       *(inst)++ = (unsigned char)0x66; \
+               } \
+               x86_64_rex_emit(inst, (size), 0, 0, (dregp)); \
+               x86_64_opcode1_emit(inst, 0xc6, (size)); \
+               x86_64_regp_emit((inst), 0, (dregp)); \
+               x86_64_imm_emit_max32(inst, (imm), (size)); \
+       } while(0)
+
 #define x86_64_mov_membase_imm_size(inst, basereg, disp, imm, size) \
        do { \
                if((size) == 2) \
@@ -2087,11 +2155,11 @@
        } while(0)
 
 /*
- * Move with sign extension to the given size (unsigned)
+ * Move with sign extension to the given size (signed)
  */
 #define x86_64_movsx8_reg_reg_size(inst, dreg, sreg, size) \
        do { \
-               x86_64_alu2_reg_reg_size((inst), 0x0f, 0xbe, (dreg), (sreg), 
(size)); \
+               x86_64_alu2_reg_reg_size((inst), 0x0f, 0xbe, (dreg), (sreg), 
(size) | 1); \
        }while(0)
 
 #define x86_64_movsx8_reg_regp_size(inst, dreg, sregp, size) \
@@ -2139,12 +2207,37 @@
                x86_64_alu2_reg_memindex_size((inst), 0x0f, 0xbf, (dreg), 
(basereg), (disp), (indexreg), (shift), (size)); \
        }while(0)
 
+#define x86_64_movsx32_reg_reg_size(inst, dreg, sreg, size) \
+       do { \
+               x86_64_alu1_reg_reg_size((inst), 0x63, (dreg), (sreg), (size)); 
\
+       }while(0)
+
+#define x86_64_movsx32_reg_regp_size(inst, dreg, sregp, size) \
+       do { \
+               x86_64_alu1_reg_regp_size((inst), 0x63, (dreg), (sregp), 
(size)); \
+       }while(0)
+
+#define x86_64_movsx32_reg_mem_size(inst, dreg, mem, size) \
+       do { \
+               x86_64_alu1_reg_mem_size((inst), 0x63, (dreg), (mem), (size)); \
+       }while(0)
+
+#define x86_64_movsx32_reg_membase_size(inst, dreg, basereg, disp, size) \
+       do { \
+               x86_64_alu1_reg_membase_size((inst), 0x63, (dreg), (basereg), 
(disp), (size)); \
+       }while(0)
+
+#define x86_64_movsx32_reg_memindex_size(inst, dreg, basereg, disp, indexreg, 
shift, size) \
+       do { \
+               x86_64_alu1_reg_memindex_size((inst), 0x63, (dreg), (basereg), 
(disp), (indexreg), (shift), (size)); \
+       }while(0)
+
 /*
  * Move with zero extension to the given size (unsigned)
  */
 #define x86_64_movzx8_reg_reg_size(inst, dreg, sreg, size) \
        do { \
-               x86_64_alu2_reg_reg_size((inst), 0x0f, 0xb6, (dreg), (sreg), 
(size)); \
+               x86_64_alu2_reg_reg_size((inst), 0x0f, 0xb6, (dreg), (sreg), 
(size) | 1); \
        }while(0)
 
 #define x86_64_movzx8_reg_regp_size(inst, dreg, sregp, size) \
@@ -2261,10 +2354,50 @@
  */
 #define x86_64_push_imm(inst, imm) \
        do { \
-               x86_push_imm((inst), (imm)); \
+               int _imm = (int) (imm); \
+               if(x86_is_imm8(_imm)) \
+               { \
+                       *(inst)++ = (unsigned char)0x6A; \
+                       x86_imm_emit8 ((inst), (_imm)); \
+               } \
+               else \
+               { \
+                       *(inst)++ = (unsigned char)0x68; \
+                       x86_imm_emit32((inst), (_imm)); \
+               } \
        } while(0)
 
 /*
+ * Use this version if you need a specific width of the value
+ * pushed. The Value on the stack will allways be 64bit wide.
+ */
+#define x86_64_push_imm_size(inst, imm, size) \
+       do { \
+               switch(size) \
+               { \
+                       case 1: \
+                       { \
+                               *(inst)++ = (unsigned char)0x6A; \
+                               x86_imm_emit8((inst), (imm)); \
+                       } \
+                       break; \
+                       case 2: \
+                       { \
+                               *(inst)++ = (unsigned char)0x66; \
+                               *(inst)++ = (unsigned char)0x68; \
+                               x86_imm_emit16((inst), (imm)); \
+                       } \
+                       break; \
+                       case 4: \
+                       { \
+                               *(inst)++ = (unsigned char)0x68; \
+                               x86_imm_emit32((inst), (imm)); \
+                       }\
+               } \
+       } while (0)
+
+
+/*
  * Pop instructions have a default size of 64 bit in 64 bit mode.
  * There is no way to encode a 32 bit pop.
  * So only the sizes 2 and 8 are allowed.
@@ -2421,6 +2554,54 @@
        } while(0)
 
 /*
+ * Set the low byte in a register to 0x01 if a condition is met
+ * or 0x00 otherwise.
+ */
+#define x86_64_set_reg(inst, cond, dreg, is_signed) \
+       do { \
+               x86_64_rex_emit((inst), 1, 0, 0, (dreg)); \
+               *(inst)++ = (unsigned char)0x0f; \
+               if((is_signed)) \
+               { \
+                       *(inst)++ = x86_cc_signed_map[(cond)] + 0x20; \
+               } \
+               else \
+               { \
+                       *(inst)++ = x86_cc_unsigned_map[(cond)] + 0x20; \
+               } \
+               x86_64_reg_emit((inst), 0, (dreg)); \
+       } while(0)
+
+#define x86_64_set_mem(inst, cond, mem, is_signed) \
+       do { \
+               *(inst)++ = (unsigned char)0x0f; \
+               if((is_signed)) \
+               { \
+                       *(inst)++ = x86_cc_signed_map[(cond)] + 0x20; \
+               } \
+               else \
+               { \
+                       *(inst)++ = x86_cc_unsigned_map[(cond)] + 0x20; \
+               } \
+               x86_64_mem_emit((inst), 0, (mem)); \
+       } while(0)
+
+#define x86_64_set_membase(inst, cond, basereg, disp, is_signed) \
+       do { \
+               x86_64_rex_emit((inst), 4, 0, 0, (basereg)); \
+               *(inst)++ = (unsigned char)0x0f; \
+               if((is_signed)) \
+               { \
+                       *(inst)++ = x86_cc_signed_map[(cond)] + 0x20;   \
+               } \
+               else    \
+               { \
+                       *(inst)++ = x86_cc_unsigned_map[(cond)] + 0x20; \
+               } \
+               x86_64_membase_emit((inst), 0, (basereg), (disp));      \
+       } while(0)
+
+/*
  * ret
  */
 #define x86_64_ret(inst) \
@@ -2433,79 +2614,1102 @@
  */
 
 /*
- * movaps
+ * xmm instructions with two opcodes
  */
-#define x86_64_movaps_reg_reg(inst, dreg, sreg) \
+#define x86_64_xmm2_reg_reg(inst, opc1, opc2, r, reg) \
        do { \
-               x86_64_xmm2_reg_reg((inst), 0x0f, 0x28, (dreg), (sreg)); \
+               x86_64_rex_emit(inst, 0, (r), 0, (reg)); \
+               *(inst)++ = (unsigned char)(opc1); \
+               *(inst)++ = (unsigned char)(opc2); \
+               x86_64_reg_emit(inst, (r), (reg)); \
        } while(0)
 
-#define x86_64_movaps_membase_reg(inst, basereg, disp, sreg) \
+#define x86_64_xmm2_reg_regp(inst, opc1, opc2, r, regp) \
        do { \
-               x86_64_xmm2_reg_membase((inst), 0x0f, 0x29, (sreg), (basereg), 
(disp)); \
+               x86_64_rex_emit(inst, 0, (r), 0, (regp)); \
+               *(inst)++ = (unsigned char)(opc1); \
+               *(inst)++ = (unsigned char)(opc2); \
+               x86_64_regp_emit(inst, (r), (regp)); \
        } while(0)
 
-#define x86_64_movaps_memindex_reg(inst, basereg, disp, indexreg, shift, sreg) 
\
+#define x86_64_xmm2_reg_mem(inst, opc1, opc2, r, mem) \
        do { \
-               x86_64_xmm2_reg_memindex((inst), 0x0f, 0x29, (sreg), (basereg), 
(disp), (indexreg), (shift)); \
+               x86_64_rex_emit(inst, 0, (r), 0, 0); \
+               *(inst)++ = (unsigned char)(opc1); \
+               *(inst)++ = (unsigned char)(opc2); \
+               x86_64_mem_emit(inst, (r), (mem)); \
        } while(0)
 
-#define x86_64_movaps_regp_reg(inst, dregp, sreg) \
+#define x86_64_xmm2_reg_membase(inst, opc1, opc2, r, basereg, disp) \
        do { \
-               x86_64_xmm2_reg_regp((inst), 0x0f, 0x29, (sreg), (dregp)); \
+               x86_64_rex_emit(inst, 0, (r), 0, (basereg)); \
+               *(inst)++ = (unsigned char)(opc1); \
+               *(inst)++ = (unsigned char)(opc2); \
+               x86_64_membase_emit(inst, (r), (basereg), (disp)); \
        } while(0)
 
-#define x86_64_movaps_reg_regp(inst, dreg, sregp) \
+#define x86_64_xmm2_reg_memindex(inst, opc1, opc2, r, basereg, disp, indexreg, 
shift) \
        do { \
-               x86_64_xmm2_reg_regp((inst), 0x0f, 0x28, (dreg), (sregp)); \
+               x86_64_rex_emit(inst, 0, (r), (indexreg), (basereg)); \
+               *(inst)++ = (unsigned char)(opc1); \
+               *(inst)++ = (unsigned char)(opc2); \
+               x86_64_memindex_emit((inst), (r), (basereg), (disp), 
(indexreg), (shift)); \
        } while(0)
 
-#define x86_64_movaps_reg_membase(inst, dreg, basereg, disp) \
/*
 * Like x86_64_xmm2_* but with a one byte prefix (p1) emitted before the
 * REX prefix.  The size argument is forwarded to the REX emitter
 * (8 presumably selects REX.W — confirm in x86_64_rex_emit).
 */
#define x86_64_p1_xmm2_reg_reg_size(inst, p1, opc1, opc2, r, reg, size) \
	do { \
		*(inst)++ = (unsigned char)(p1); \
		x86_64_rex_emit((inst), (size), (r), 0, (reg)); \
		*(inst)++ = (unsigned char)(opc1); \
		*(inst)++ = (unsigned char)(opc2); \
		x86_64_reg_emit((inst), (r), (reg)); \
	} while(0)

#define x86_64_p1_xmm2_reg_regp_size(inst, p1, opc1, opc2, r, regp, size) \
	do { \
		*(inst)++ = (unsigned char)(p1); \
		x86_64_rex_emit((inst), (size), (r), 0, (regp)); \
		*(inst)++ = (unsigned char)(opc1); \
		*(inst)++ = (unsigned char)(opc2); \
		x86_64_regp_emit((inst), (r), (regp)); \
	} while(0)

#define x86_64_p1_xmm2_reg_mem_size(inst, p1, opc1, opc2, r, mem, size) \
	do { \
		*(inst)++ = (unsigned char)(p1); \
		x86_64_rex_emit((inst), (size), (r), 0, 0); \
		*(inst)++ = (unsigned char)(opc1); \
		*(inst)++ = (unsigned char)(opc2); \
		x86_64_mem_emit((inst), (r), (mem)); \
	} while(0)

#define x86_64_p1_xmm2_reg_membase_size(inst, p1, opc1, opc2, r, basereg, disp, size) \
	do { \
		*(inst)++ = (unsigned char)(p1); \
		x86_64_rex_emit((inst), (size), (r), 0, (basereg)); \
		*(inst)++ = (unsigned char)(opc1); \
		*(inst)++ = (unsigned char)(opc2); \
		x86_64_membase_emit((inst), (r), (basereg), (disp)); \
	} while(0)

#define x86_64_p1_xmm2_reg_memindex_size(inst, p1, opc1, opc2, r, basereg, disp, indexreg, shift, size) \
	do { \
		*(inst)++ = (unsigned char)(p1); \
		x86_64_rex_emit((inst), (size), (r), (indexreg), (basereg)); \
		*(inst)++ = (unsigned char)(opc1); \
		*(inst)++ = (unsigned char)(opc2); \
		x86_64_memindex_emit((inst), (r), (basereg), (disp), (indexreg), (shift)); \
	} while(0)
+
/*
 * xmm1: helpers for the X86_64_XMM1 enum — select the 0xf2 (double) or
 * 0xf3 (single) prefix from the is_double flag and emit 0x0f <opc>.
 */
#define x86_64_xmm1_reg_reg(inst, opc, dreg, sreg, is_double) \
	x86_64_p1_xmm2_reg_reg_size((inst), ((is_double) ? 0xf2 : 0xf3), 0x0f, (opc), (dreg), (sreg), 0)

#define x86_64_xmm1_reg_regp(inst, opc, dreg, sregp, is_double) \
	x86_64_p1_xmm2_reg_regp_size((inst), ((is_double) ? 0xf2 : 0xf3), 0x0f, (opc), (dreg), (sregp), 0)

#define x86_64_xmm1_reg_mem(inst, opc, dreg, mem, is_double) \
	x86_64_p1_xmm2_reg_mem_size((inst), ((is_double) ? 0xf2 : 0xf3), 0x0f, (opc), (dreg), (mem), 0)

#define x86_64_xmm1_reg_membase(inst, opc, dreg, basereg, disp, is_double) \
	x86_64_p1_xmm2_reg_membase_size((inst), ((is_double) ? 0xf2 : 0xf3), 0x0f, (opc), (dreg), (basereg), (disp), 0)

#define x86_64_xmm1_reg_memindex(inst, opc, dreg, basereg, disp, indexreg, shift, is_double) \
	x86_64_p1_xmm2_reg_memindex_size((inst), ((is_double) ? 0xf2 : 0xf3), 0x0f, (opc), (dreg), (basereg), (disp), (indexreg), (shift), 0)
+
/*
 * Move instructions
 */

/*
 * movaps: move an aligned 16 byte xmm value.
 * Opcode 0x0f 0x28 loads into a register, 0x0f 0x29 stores from one.
 */
#define x86_64_movaps_reg_reg(inst, dreg, sreg) \
	x86_64_xmm2_reg_reg((inst), 0x0f, 0x28, (dreg), (sreg))

#define x86_64_movaps_regp_reg(inst, dregp, sreg) \
	x86_64_xmm2_reg_regp((inst), 0x0f, 0x29, (sreg), (dregp))

#define x86_64_movaps_mem_reg(inst, mem, sreg) \
	x86_64_xmm2_reg_mem((inst), 0x0f, 0x29, (sreg), (mem))

#define x86_64_movaps_membase_reg(inst, basereg, disp, sreg) \
	x86_64_xmm2_reg_membase((inst), 0x0f, 0x29, (sreg), (basereg), (disp))

#define x86_64_movaps_memindex_reg(inst, basereg, disp, indexreg, shift, sreg) \
	x86_64_xmm2_reg_memindex((inst), 0x0f, 0x29, (sreg), (basereg), (disp), (indexreg), (shift))

#define x86_64_movaps_reg_regp(inst, dreg, sregp) \
	x86_64_xmm2_reg_regp((inst), 0x0f, 0x28, (dreg), (sregp))

#define x86_64_movaps_reg_mem(inst, dreg, mem) \
	x86_64_xmm2_reg_mem((inst), 0x0f, 0x28, (dreg), (mem))

#define x86_64_movaps_reg_membase(inst, dreg, basereg, disp) \
	x86_64_xmm2_reg_membase((inst), 0x0f, 0x28, (dreg), (basereg), (disp))

#define x86_64_movaps_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_xmm2_reg_memindex((inst), 0x0f, 0x28, (dreg), (basereg), (disp), (indexreg), (shift))
 
 /*
- * movsd
+ * movups: Move unaligned quadword (16 bytes)
+ */
+#define x86_64_movups_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_xmm2_reg_reg((inst), 0x0f, 0x10, (dreg), (sreg)); \
+       } while(0)
+
+#define x86_64_movups_regp_reg(inst, dregp, sreg) \
+       do { \
+               x86_64_xmm2_reg_regp((inst), 0x0f, 0x11, (sreg), (dregp)); \
+       } while(0)
+
+#define x86_64_movups_mem_reg(inst, mem, sreg) \
+       do { \
+               x86_64_xmm2_reg_mem((inst), 0x0f, 0x11, (sreg), (mem)); \
+       } while(0)
+
+#define x86_64_movups_membase_reg(inst, basereg, disp, sreg) \
+       do { \
+               x86_64_xmm2_reg_membase((inst), 0x0f, 0x11, (sreg), (basereg), 
(disp)); \
+       } while(0)
+
+#define x86_64_movups_memindex_reg(inst, basereg, disp, indexreg, shift, sreg) 
\
+       do { \
+               x86_64_xmm2_reg_memindex((inst), 0x0f, 0x11, (sreg), (basereg), 
(disp), (indexreg), (shift)); \
+       } while(0)
+
+#define x86_64_movups_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_xmm2_reg_regp((inst), 0x0f, 0x10, (dreg), (sregp)); \
+       } while(0)
+
+#define x86_64_movups_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_xmm2_reg_mem((inst), 0x0f, 0x10, (dreg), (mem)); \
+       } while(0)
+
+#define x86_64_movups_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_xmm2_reg_membase((inst), 0x0f, 0x10, (dreg), (basereg), 
(disp)); \
+       } while(0)
+
+#define x86_64_movups_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) 
\
+       do { \
+               x86_64_xmm2_reg_memindex((inst), 0x0f, 0x10, (dreg), (basereg), 
(disp), (indexreg), (shift)); \
+       } while(0)
+
/*
 * movsd: move a scalar double (64 bit float).
 * Prefix 0xf2, opcode 0x0f 0x10 loads, 0x0f 0x11 stores.
 */
#define x86_64_movsd_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x10, (dreg), (sreg), 0)

#define x86_64_movsd_regp_reg(inst, dregp, sreg) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x11, (sreg), (dregp), 0)

#define x86_64_movsd_mem_reg(inst, mem, sreg) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x11, (sreg), (mem), 0)

#define x86_64_movsd_membase_reg(inst, basereg, disp, sreg) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x11, (sreg), (basereg), (disp), 0)

#define x86_64_movsd_memindex_reg(inst, basereg, disp, indexreg, shift, sreg) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x11, (sreg), (basereg), (disp), (indexreg), (shift), 0)

#define x86_64_movsd_reg_regp(inst, dreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x10, (dreg), (sregp), 0)

#define x86_64_movsd_reg_mem(inst, dreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x10, (dreg), (mem), 0)

#define x86_64_movsd_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x10, (dreg), (basereg), (disp), 0)

#define x86_64_movsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x10, (dreg), (basereg), (disp), (indexreg), (shift), 0)
+
/*
 * movss: move a scalar single (32 bit float).
 * Prefix 0xf3, opcode 0x0f 0x10 loads, 0x0f 0x11 stores.
 */
#define x86_64_movss_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x10, (dreg), (sreg), 0)

#define x86_64_movss_regp_reg(inst, dregp, sreg) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x11, (sreg), (dregp), 0)

#define x86_64_movss_mem_reg(inst, mem, sreg) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x11, (sreg), (mem), 0)

#define x86_64_movss_membase_reg(inst, basereg, disp, sreg) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x11, (sreg), (basereg), (disp), 0)

#define x86_64_movss_memindex_reg(inst, basereg, disp, indexreg, shift, sreg) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x11, (sreg), (basereg), (disp), (indexreg), (shift), 0)

#define x86_64_movss_reg_regp(inst, dreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x10, (dreg), (sregp), 0)

#define x86_64_movss_reg_mem(inst, dreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x10, (dreg), (mem), 0)

#define x86_64_movss_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x10, (dreg), (basereg), (disp), 0)

#define x86_64_movss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x10, (dreg), (basereg), (disp), (indexreg), (shift), 0)
+
/*
 * Conversion opcodes
 */

/*
 * cvtsi2ss: convert a signed integer to a float32.
 * size is the width of the integer source (4 or 8).
 */
#define x86_64_cvtsi2ss_reg_reg_size(inst, dxreg, sreg, size) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x2a, (dxreg), (sreg), (size))

#define x86_64_cvtsi2ss_reg_regp_size(inst, dxreg, sregp, size) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x2a, (dxreg), (sregp), (size))

#define x86_64_cvtsi2ss_reg_mem_size(inst, dxreg, mem, size) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x2a, (dxreg), (mem), (size))

#define x86_64_cvtsi2ss_reg_membase_size(inst, dreg, basereg, disp, size) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x2a, (dreg), (basereg), (disp), (size))

#define x86_64_cvtsi2ss_reg_memindex_size(inst, dreg, basereg, disp, indexreg, shift, size) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x2a, (dreg), (basereg), (disp), (indexreg), (shift), (size))

/*
 * cvtsi2sd: convert a signed integer to a float64.
 * size is the width of the integer source (4 or 8).
 */
#define x86_64_cvtsi2sd_reg_reg_size(inst, dxreg, sreg, size) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x2a, (dxreg), (sreg), (size))

#define x86_64_cvtsi2sd_reg_regp_size(inst, dxreg, sregp, size) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x2a, (dxreg), (sregp), (size))

#define x86_64_cvtsi2sd_reg_mem_size(inst, dxreg, mem, size) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x2a, (dxreg), (mem), (size))

#define x86_64_cvtsi2sd_reg_membase_size(inst, dreg, basereg, disp, size) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x2a, (dreg), (basereg), (disp), (size))

#define x86_64_cvtsi2sd_reg_memindex_size(inst, dreg, basereg, disp, indexreg, shift, size) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x2a, (dreg), (basereg), (disp), (indexreg), (shift), (size))

/*
 * cvtss2sd: convert a float32 to a float64.
 */
#define x86_64_cvtss2sd_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x5a, (dreg), (sreg), 0)

#define x86_64_cvtss2sd_reg_regp(inst, dxreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x5a, (dxreg), (sregp), 0)

#define x86_64_cvtss2sd_reg_mem(inst, dxreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x5a, (dxreg), (mem), 0)

#define x86_64_cvtss2sd_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x5a, (dreg), (basereg), (disp), 0)

#define x86_64_cvtss2sd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x5a, (dreg), (basereg), (disp), (indexreg), (shift), 0)

/*
 * cvtsd2ss: convert a float64 to a float32.
 */
#define x86_64_cvtsd2ss_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5a, (dreg), (sreg), 0)

#define x86_64_cvtsd2ss_reg_regp(inst, dxreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5a, (dxreg), (sregp), 0)

#define x86_64_cvtsd2ss_reg_mem(inst, dxreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5a, (dxreg), (mem), 0)

#define x86_64_cvtsd2ss_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5a, (dreg), (basereg), (disp), 0)

#define x86_64_cvtsd2ss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5a, (dreg), (basereg), (disp), (indexreg), (shift), 0)
+
/*
 * Arithmetic opcodes
 */

/*
 * addss: add scalar single precision float values (prefix 0xf3, opcode 0x58).
 */
#define x86_64_addss_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x58, (dreg), (sreg), 0)

#define x86_64_addss_reg_regp(inst, dreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x58, (dreg), (sregp), 0)

#define x86_64_addss_reg_mem(inst, dreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x58, (dreg), (mem), 0)

#define x86_64_addss_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x58, (dreg), (basereg), (disp), 0)

#define x86_64_addss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x58, (dreg), (basereg), (disp), (indexreg), (shift), 0)

/*
 * subss: subtract scalar single precision float values (opcode 0x5c).
 */
#define x86_64_subss_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x5c, (dreg), (sreg), 0)

#define x86_64_subss_reg_regp(inst, dreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x5c, (dreg), (sregp), 0)

#define x86_64_subss_reg_mem(inst, dreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x5c, (dreg), (mem), 0)

#define x86_64_subss_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x5c, (dreg), (basereg), (disp), 0)

#define x86_64_subss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x5c, (dreg), (basereg), (disp), (indexreg), (shift), 0)

/*
 * mulss: multiply scalar single precision float values (opcode 0x59).
 */
#define x86_64_mulss_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x59, (dreg), (sreg), 0)

#define x86_64_mulss_reg_regp(inst, dreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x59, (dreg), (sregp), 0)

#define x86_64_mulss_reg_mem(inst, dreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x59, (dreg), (mem), 0)

#define x86_64_mulss_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x59, (dreg), (basereg), (disp), 0)

#define x86_64_mulss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x59, (dreg), (basereg), (disp), (indexreg), (shift), 0)

/*
 * divss: divide scalar single precision float values (opcode 0x5e).
 */
#define x86_64_divss_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x5e, (dreg), (sreg), 0)

#define x86_64_divss_reg_regp(inst, dreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x5e, (dreg), (sregp), 0)

#define x86_64_divss_reg_mem(inst, dreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x5e, (dreg), (mem), 0)

#define x86_64_divss_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x5e, (dreg), (basereg), (disp), 0)

#define x86_64_divss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x5e, (dreg), (basereg), (disp), (indexreg), (shift), 0)
+
/*
 * addsd: add scalar double precision float values (prefix 0xf2, opcode 0x58).
 */
#define x86_64_addsd_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x58, (dreg), (sreg), 0)

#define x86_64_addsd_reg_regp(inst, dreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x58, (dreg), (sregp), 0)

#define x86_64_addsd_reg_mem(inst, dreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x58, (dreg), (mem), 0)

#define x86_64_addsd_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x58, (dreg), (basereg), (disp), 0)

#define x86_64_addsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x58, (dreg), (basereg), (disp), (indexreg), (shift), 0)

/*
 * subsd: subtract scalar double precision float values (opcode 0x5c).
 */
#define x86_64_subsd_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (sreg), 0)

#define x86_64_subsd_reg_regp(inst, dreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (sregp), 0)

#define x86_64_subsd_reg_mem(inst, dreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (mem), 0)

#define x86_64_subsd_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (basereg), (disp), 0)

#define x86_64_subsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5c, (dreg), (basereg), (disp), (indexreg), (shift), 0)

/*
 * mulsd: multiply scalar double precision float values (opcode 0x59).
 */
#define x86_64_mulsd_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x59, (dreg), (sreg), 0)

#define x86_64_mulsd_reg_regp(inst, dreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x59, (dreg), (sregp), 0)

#define x86_64_mulsd_reg_mem(inst, dreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x59, (dreg), (mem), 0)

#define x86_64_mulsd_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x59, (dreg), (basereg), (disp), 0)

#define x86_64_mulsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x59, (dreg), (basereg), (disp), (indexreg), (shift), 0)

/*
 * divsd: divide scalar double precision float values (opcode 0x5e).
 */
#define x86_64_divsd_reg_reg(inst, dreg, sreg) \
	x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (sreg), 0)

#define x86_64_divsd_reg_regp(inst, dreg, sregp) \
	x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (sregp), 0)

#define x86_64_divsd_reg_mem(inst, dreg, mem) \
	x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (mem), 0)

#define x86_64_divsd_reg_membase(inst, dreg, basereg, disp) \
	x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (basereg), (disp), 0)

#define x86_64_divsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
	x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5e, (dreg), (basereg), (disp), (indexreg), (shift), 0)
+
/*
 * fpu instructions
 */

/*
 * fld: push a float value from memory onto the FPU register stack.
 * size selects the memory operand width: 4 (float32, opcode 0xd9 /0),
 * 8 (float64, 0xdd /0) or 10 (long double, 0xdb /5).
 * A switch is used so that the size argument is evaluated only once.
 */
#define x86_64_fld_regp_size(inst, sregp, size) \
	do { \
		x86_64_rex_emit((inst), 0, 0, 0, (sregp)); \
		switch(size) \
		{ \
		case 4: /* 32 bit float */ \
			*(inst)++ = (unsigned char)0xd9; \
			x86_64_regp_emit((inst), 0, (sregp)); \
			break; \
		case 8: /* 64 bit float */ \
			*(inst)++ = (unsigned char)0xdd; \
			x86_64_regp_emit((inst), 0, (sregp)); \
			break; \
		case 10: /* 80 bit long double */ \
			*(inst)++ = (unsigned char)0xdb; \
			x86_64_regp_emit((inst), 5, (sregp)); \
			break; \
		} \
	} while(0)

#define x86_64_fld_mem_size(inst, mem, size) \
	do { \
		switch(size) \
		{ \
		case 4: /* 32 bit float */ \
			*(inst)++ = (unsigned char)0xd9; \
			x86_64_mem_emit((inst), 0, (mem)); \
			break; \
		case 8: /* 64 bit float */ \
			*(inst)++ = (unsigned char)0xdd; \
			x86_64_mem_emit((inst), 0, (mem)); \
			break; \
		case 10: /* 80 bit long double */ \
			*(inst)++ = (unsigned char)0xdb; \
			x86_64_mem_emit((inst), 5, (mem)); \
			break; \
		} \
	} while(0)

#define x86_64_fld_membase_size(inst, basereg, disp, size) \
	do { \
		x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
		switch(size) \
		{ \
		case 4: /* 32 bit float */ \
			*(inst)++ = (unsigned char)0xd9; \
			x86_64_membase_emit((inst), 0, (basereg), (disp)); \
			break; \
		case 8: /* 64 bit float */ \
			*(inst)++ = (unsigned char)0xdd; \
			x86_64_membase_emit((inst), 0, (basereg), (disp)); \
			break; \
		case 10: /* 80 bit long double */ \
			*(inst)++ = (unsigned char)0xdb; \
			x86_64_membase_emit((inst), 5, (basereg), (disp)); \
			break; \
		} \
	} while(0)

#define x86_64_fld_memindex_size(inst, basereg, disp, indexreg, shift, size) \
	do { \
		x86_64_rex_emit((inst), 0, 0, (indexreg), (basereg)); \
		switch(size) \
		{ \
		case 4: /* 32 bit float */ \
			*(inst)++ = (unsigned char)0xd9; \
			x86_64_memindex_emit((inst), 0, (basereg), (disp), (indexreg), (shift)); \
			break; \
		case 8: /* 64 bit float */ \
			*(inst)++ = (unsigned char)0xdd; \
			x86_64_memindex_emit((inst), 0, (basereg), (disp), (indexreg), (shift)); \
			break; \
		case 10: /* 80 bit long double */ \
			*(inst)++ = (unsigned char)0xdb; \
			x86_64_memindex_emit((inst), 5, (basereg), (disp), (indexreg), (shift)); \
			break; \
		} \
	} while(0)
+
+/*
+ * fild: Load an integer and convert it to long double
+ */
+/*
+ * fild from an absolute address: int16 (0xdf /0), int32 (0xdb /0) or
+ * int64 (0xdf /5) is loaded and converted to long double on the fpu
+ * stack.  Other sizes emit no code.
+ * NOTE(review): the name lacks the x86_64_ prefix used by the sibling
+ * macros — presumably an oversight; renaming would break callers.
+ */
+#define x86_fild_mem_size(inst, mem, size) \
+       do { \
+               switch(size) \
+               { \
+                       case 2: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_mem_emit((inst), 0, (mem));      \
+                       } \
+                       break; \
+                       case 4: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_mem_emit((inst), 0, (mem));      \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_mem_emit((inst), 5, (mem));      \
+                       } \
+                       break; \
+               } \
+       } while (0)
+
+/*
+ * fild from basereg + disp: int16 (0xdf /0), int32 (0xdb /0) or
+ * int64 (0xdf /5) is loaded and converted to long double on the fpu
+ * stack.  Other sizes emit no code.
+ * Fix: the parameter list used to be (inst, mem, size) although the
+ * body expands basereg and disp, so the macro could not compile when
+ * used; it now declares the parameters it actually consumes.
+ */
+#define x86_fild_membase_size(inst, basereg, disp, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
+               switch(size) \
+               { \
+                       case 2: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_membase_emit((inst), 0, (basereg), (disp)); \
+                       } \
+                       break; \
+                       case 4: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_membase_emit((inst), 0, (basereg), (disp)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_membase_emit((inst), 5, (basereg), (disp)); \
+                       } \
+                       break; \
+               } \
+       } while (0)
+
+/*
+ * fst: Store fpu register to memory (only float32 and float64 allowed)
+ */
+
+/*
+ * fst through a pointer register (no displacement): stores ST(0) as
+ * float32 (0xd9 /2) or float64 (0xdd /2) without popping.  There is
+ * no 80-bit fst encoding, hence no size-10 case (use fstp instead).
+ */
+#define x86_64_fst_regp_size(inst, sregp, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (sregp)); \
+               switch(size) \
+               { \
+                       case 4: \
+                       { \
+                           *(inst)++ = (unsigned char)0xd9; \
+                               x86_64_regp_emit((inst), 2, (sregp)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_regp_emit((inst), 2, (sregp)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+/*
+ * fst to an absolute address: stores ST(0) as float32 (0xd9 /2) or
+ * float64 (0xdd /2) without popping.  Other sizes emit no code.
+ */
+#define x86_64_fst_mem_size(inst, mem, size) \
+       do { \
+               switch(size) \
+               { \
+                       case 4: \
+                       { \
+                           *(inst)++ = (unsigned char)0xd9; \
+                               x86_64_mem_emit((inst), 2, (mem)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_mem_emit((inst), 2, (mem)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+/*
+ * fst to basereg + disp: stores ST(0) as float32 (0xd9 /2) or
+ * float64 (0xdd /2) without popping.  Other sizes emit no code.
+ */
+#define x86_64_fst_membase_size(inst, basereg, disp, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
+               switch(size) \
+               { \
+                       case 4: \
+                       { \
+                           *(inst)++ = (unsigned char)0xd9; \
+                               x86_64_membase_emit((inst), 2, (basereg), 
(disp)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_membase_emit((inst), 2, (basereg), 
(disp)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+/*
+ * fst to basereg + indexreg * 2^shift + disp: stores ST(0) as
+ * float32 (0xd9 /2) or float64 (0xdd /2) without popping.
+ */
+#define x86_64_fst_memindex_size(inst, basereg, disp, indexreg, shift, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, (indexreg), (basereg)); \
+               switch(size) \
+               { \
+                       case 4: \
+                       { \
+                           *(inst)++ = (unsigned char)0xd9; \
+                               x86_64_memindex_emit((inst), 2, (basereg), 
(disp), (indexreg), (shift)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_memindex_emit((inst), 2, (basereg), 
(disp), (indexreg), (shift)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+/*
+ * fstp: store top fpu register to memory and pop it from the fpu stack
+ */
+
+/*
+ * fstp through a pointer register: stores ST(0) as float32 (0xd9 /3),
+ * float64 (0xdd /3) or long double (0xdb /7) and pops the fpu stack.
+ */
+#define x86_64_fstp_regp_size(inst, sregp, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (sregp)); \
+               switch(size) \
+               { \
+                       case 4: \
+                       { \
+                           *(inst)++ = (unsigned char)0xd9; \
+                               x86_64_regp_emit((inst), 3, (sregp)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_regp_emit((inst), 3, (sregp)); \
+                       } \
+                       break; \
+                       case 10: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_regp_emit((inst), 7, (sregp)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+/*
+ * fstp to an absolute address: stores ST(0) as float32 (0xd9 /3),
+ * float64 (0xdd /3) or long double (0xdb /7) and pops the fpu stack.
+ */
+#define x86_64_fstp_mem_size(inst, mem, size) \
+       do { \
+               switch(size) \
+               { \
+                       case 4: \
+                       { \
+                           *(inst)++ = (unsigned char)0xd9; \
+                               x86_64_mem_emit((inst), 3, (mem)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_mem_emit((inst), 3, (mem)); \
+                       } \
+                       break; \
+                       case 10: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_mem_emit((inst), 7, (mem)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+/*
+ * fstp to basereg + disp: stores ST(0) as float32 (0xd9 /3),
+ * float64 (0xdd /3) or long double (0xdb /7) and pops the fpu stack.
+ */
+#define x86_64_fstp_membase_size(inst, basereg, disp, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
+               switch(size) \
+               { \
+                       case 4: \
+                       { \
+                           *(inst)++ = (unsigned char)0xd9; \
+                               x86_64_membase_emit((inst), 3, (basereg), 
(disp)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_membase_emit((inst), 3, (basereg), 
(disp)); \
+                       } \
+                       break; \
+                       case 10: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_membase_emit((inst), 7, (basereg), 
(disp)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+/*
+ * fstp to basereg + indexreg * 2^shift + disp: stores ST(0) as
+ * float32 (0xd9 /3), float64 (0xdd /3) or long double (0xdb /7)
+ * and pops the fpu stack.
+ */
+#define x86_64_fstp_memindex_size(inst, basereg, disp, indexreg, shift, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, (indexreg), (basereg)); \
+               switch(size) \
+               { \
+                       case 4: \
+                       { \
+                           *(inst)++ = (unsigned char)0xd9; \
+                               x86_64_memindex_emit((inst), 3, (basereg), 
(disp), (indexreg), (shift)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_memindex_emit((inst), 3, (basereg), 
(disp), (indexreg), (shift)); \
+                       } \
+                       break; \
+                       case 10: \
+                       { \
+                           *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_memindex_emit((inst), 7, (basereg), 
(disp), (indexreg), (shift)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+/*
+ * Convert long double to integer
+ */
+/*
+ * fistp to an absolute address: converts ST(0) to int16 (0xdf /3),
+ * int32 (0xdb /3) or int64 (0xdf /7) and pops the fpu stack.
+ */
+#define x86_64_fistp_mem_size(inst, mem, size) \
+       do { \
+               switch((size)) \
+               { \
+                       case 2: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_mem_emit((inst), 3, (mem)); \
+                       } \
+                       break; \
+                       case 4: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_mem_emit((inst), 3, (mem)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_mem_emit((inst), 7, (mem)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+/*
+ * fistp to basereg + disp: converts ST(0) to int16 (0xdf /3),
+ * int32 (0xdb /3) or int64 (0xdf /7) and pops the fpu stack.
+ * Fix: emit the REX prefix for basereg like every other *_membase_size
+ * macro in this header; without it base registers R8-R15 would encode
+ * as RAX-RDI.
+ */
+#define x86_64_fistp_membase_size(inst, basereg, disp, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
+               switch((size)) \
+               { \
+                       case 2: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_membase_emit((inst), 3, (basereg), (disp)); \
+                       } \
+                       break; \
+                       case 4: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_membase_emit((inst), 3, (basereg), (disp)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_membase_emit((inst), 7, (basereg), (disp)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+/*
+ * Store fpu control word after checking for pending unmasked fpu exceptions
+ */
+/* fnstcw (0xd9 /7) to an absolute address */
+#define x86_64_fnstcw(inst, mem) \
+       do { \
+               *(inst)++ = (unsigned char)0xd9; \
+               x86_64_mem_emit((inst), 7, (mem)); \
+       } while (0)
+
+/*
+ * fnstcw (0xd9 /7) to basereg + disp.
+ * Fix: emit the REX prefix for basereg like the other *_membase
+ * macros; without it base registers R8-R15 would encode as RAX-RDI.
+ */
+#define x86_64_fnstcw_membase(inst, basereg, disp) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
+               *(inst)++ = (unsigned char)0xd9; \
+               x86_64_membase_emit((inst), 7, (basereg), (disp)); \
+       } while(0)
+
+/*
+ * Load fpu control word
+ */
+/* fldcw (0xd9 /5) from an absolute address */
+#define x86_64_fldcw(inst, mem) \
+       do { \
+               *(inst)++ = (unsigned char)0xd9; \
+               x86_64_mem_emit((inst), 5, (mem)); \
+       } while(0)
+
+/*
+ * fldcw (0xd9 /5) from basereg + disp.
+ * Fix: emit the REX prefix for basereg like the other *_membase
+ * macros; without it base registers R8-R15 would encode as RAX-RDI.
+ */
+#define x86_64_fldcw_membase(inst, basereg, disp) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
+               *(inst)++ = (unsigned char)0xd9; \
+               x86_64_membase_emit ((inst), 5, (basereg), (disp)); \
        } while(0)
 
 #ifdef __cplusplus

Index: jit/jit-insn.c
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/jit/jit-insn.c,v
retrieving revision 1.58
retrieving revision 1.59
diff -u -b -r1.58 -r1.59
--- jit/jit-insn.c      24 Jan 2008 20:12:52 -0000      1.58
+++ jit/jit-insn.c      2 Mar 2008 17:07:06 -0000       1.59
@@ -6206,9 +6206,6 @@
                 JIT_CALL_NORETURN);
        jit_type_free(signature);
 
-       /* The "jit_exception_builtin" function will never return */
-       func->builder->current_block->ends_in_dead = 1;
-
        /* Execution continues here if there was no exception */
        if(!jit_insn_label(func, &label))
        {

Index: jit/jit-rules.h
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/jit/jit-rules.h,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -b -r1.16 -r1.17
--- jit/jit-rules.h     24 Jan 2008 20:12:54 -0000      1.16
+++ jit/jit-rules.h     2 Mar 2008 17:07:06 -0000       1.17
@@ -42,6 +42,9 @@
 #elif defined(__alpha) || defined(__alpha__)
        #define JIT_BACKEND_ALPHA               1
        #define JIT_HAVE_BACKEND                1
+#elif defined(__amd64) || defined(__amd64__) || defined(__x86_64) || 
defined(__x86_64__)
+       #define JIT_BACKEND_X86_64              1
+       #define JIT_HAVE_BACKEND                1
 #endif
 /*#define      JIT_BACKEND_ARM         1*/
 #if !defined(JIT_HAVE_BACKEND)
@@ -88,6 +91,8 @@
        #include "jit-rules-arm.h"
 #elif defined(JIT_BACKEND_ALPHA)
        #include "jit-rules-alpha.h"
+#elif defined(JIT_BACKEND_X86_64)
+       #include "jit-rules-x86-64.h"
 #else
        #error "unknown jit backend type"
 #endif

Index: jit/jit-value.c
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/jit/jit-value.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -b -r1.13 -r1.14
--- jit/jit-value.c     24 Jan 2008 20:12:54 -0000      1.13
+++ jit/jit-value.c     2 Mar 2008 17:07:06 -0000       1.14
@@ -447,8 +447,13 @@
 
                case JIT_TYPE_LONG:
                case JIT_TYPE_ULONG:
+#ifdef JIT_NATIVE_INT64
+                       return jit_value_create_nint_constant
+                               (func, const_value->type, 
const_value->un.long_value);
+#else
                        return jit_value_create_long_constant
                                (func, const_value->type, 
const_value->un.long_value);
+#endif
 
                case JIT_TYPE_FLOAT32:
                        return jit_value_create_float32_constant

Index: jit/Makefile.am
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/jit/Makefile.am,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -b -r1.23 -r1.24
--- jit/Makefile.am     30 Jan 2008 19:26:46 -0000      1.23
+++ jit/Makefile.am     2 Mar 2008 17:07:06 -0000       1.24
@@ -62,6 +62,8 @@
                jit-rules-arm.c \
                jit-rules-x86.h \
                jit-rules-x86.c \
+               jit-rules-x86-64.h \
+               jit-rules-x86-64.c \
                jit-setjmp.h \
                jit-signal.c \
                jit-string.c \
@@ -76,7 +78,8 @@
                mklabel.sh \
                jit-rules-alpha.ins \
                jit-rules-arm.sel \
-               jit-rules-x86.ins
+               jit-rules-x86.ins \
+               jit-rules-x86-64.ins
 
 AM_CFLAGS = -I$(top_srcdir)/include -I$(top_builddir)/include -I. -I$(srcdir)
 
@@ -108,6 +111,12 @@
        $(top_builddir)/tools/gen-rules$(EXEEXT) $(srcdir)/jit-rules-alpha.ins \
                        >jit-rules-alpha.inc
 
+jit-rules-x86-64.lo: jit-rules-x86-64.inc
+
+jit-rules-x86-64.inc: jit-rules-x86-64.ins 
$(top_builddir)/tools/gen-rules$(EXEEXT)
+       $(top_builddir)/tools/gen-rules$(EXEEXT) $(srcdir)/jit-rules-x86-64.ins 
\
+                       >jit-rules-x86-64.inc
+
 CLEANFILES = \
        jit-interp-labels.h \
        jit-rules-alpha.inc \

Index: jit/jit-rules-x86-64.c
===================================================================
RCS file: jit/jit-rules-x86-64.c
diff -N jit/jit-rules-x86-64.c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ jit/jit-rules-x86-64.c      2 Mar 2008 17:07:06 -0000       1.1
@@ -0,0 +1,3549 @@
+/*
+ * jit-rules-x86-64.c - Rules that define the characteristics of the x86_64.
+ *
+ * Copyright (C) 2008  Southern Storm Software, Pty Ltd.
+ *
+ * This file is part of the libjit library.
+ *
+ * The libjit library is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation, either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * The libjit library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with the libjit library.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "jit-internal.h"
+#include "jit-rules.h"
+#include "jit-apply-rules.h"
+
+#if defined(JIT_BACKEND_X86_64)
+
+#include "jit-gen-x86-64.h"
+#include "jit-reg-alloc.h"
+#include "jit-setjmp.h"
+#include <stdio.h>
+
+/*
+ * Pseudo register numbers for the x86_64 registers. These are not the
+ * same as the CPU instruction register numbers.  The order of these
+ * values must match the order in "JIT_REG_INFO".
+ */
+#define        X86_64_REG_RAX          0
+#define        X86_64_REG_RCX          1
+#define        X86_64_REG_RDX          2
+#define        X86_64_REG_RBX          3
+#define        X86_64_REG_RSI          4
+#define        X86_64_REG_RDI          5
+#define        X86_64_REG_R8           6
+#define        X86_64_REG_R9           7
+#define        X86_64_REG_R10          8
+#define        X86_64_REG_R11          9
+#define        X86_64_REG_R12          10
+#define        X86_64_REG_R13          11
+#define        X86_64_REG_R14          12
+#define        X86_64_REG_R15          13
+#define        X86_64_REG_RBP          14
+#define        X86_64_REG_RSP          15
+#define        X86_64_REG_XMM0         16
+#define        X86_64_REG_XMM1         17
+#define        X86_64_REG_XMM2         18
+#define        X86_64_REG_XMM3         19
+#define        X86_64_REG_XMM4         20
+#define        X86_64_REG_XMM5         21
+#define        X86_64_REG_XMM6         22
+#define        X86_64_REG_XMM7         23
+#define        X86_64_REG_XMM8         24
+#define        X86_64_REG_XMM9         25
+#define        X86_64_REG_XMM10        26
+#define        X86_64_REG_XMM11        27
+#define        X86_64_REG_XMM12        28
+#define        X86_64_REG_XMM13        29
+#define        X86_64_REG_XMM14        30
+#define        X86_64_REG_XMM15        31
+#define        X86_64_REG_ST0          32
+#define        X86_64_REG_ST1          33
+#define        X86_64_REG_ST2          34
+#define        X86_64_REG_ST3          35
+#define        X86_64_REG_ST4          36
+#define        X86_64_REG_ST5          37
+#define        X86_64_REG_ST6          38
+#define        X86_64_REG_ST7          39
+
+/*
+ * Determine if a pseudo register number is general, xmm or fpu.
+ */
+#define        IS_GENERAL_REG(reg)     (((reg) & ~0x0f) == 0)
+#define        IS_XMM_REG(reg)         (((reg) & ~0x0f) == 0x10)
+#define        IS_FPU_REG(reg)         (((reg) & ~0x0f) == 0x20)
+
+/*
+ * Scratch register, that is used for calls via register and
+ * for loading the exception pc to the setjmp buffer.
+ * This register *MUST* not be used for parameter passing and
+ * *MUST* not be a callee saved register.
+ * For SysV abi R11 is perfect.
+ */
+#define X86_64_SCRATCH X86_64_R11
+
+/*
+ * Set this definition to 1 if the OS supports the SysV red zone.
+ * This is a 128 byte area below the stack pointer that is guaranteed
+ * to be not modified by interrupts or signal handlers.
+ * This allows us to use a temporary area on the stack without
+ * having to modify the stack pointer saving us two instructions.
+ * TODO: Make this a configure switch.
+ */
+#define HAVE_RED_ZONE 1
+
+/*
+ * X86_64 argument types as specified in the X86_64 SysV ABI.
+ */
+#define X86_64_ARG_NO_CLASS            0x00
+#define X86_64_ARG_INTEGER             0x01
+#define X86_64_ARG_MEMORY              0x02
+#define X86_64_ARG_SSE                 0x11
+#define X86_64_ARG_SSEUP               0x12
+#define X86_64_ARG_X87                 0x21
+#define X86_64_ARG_X87UP               0x22
+
+#define X86_64_ARG_IS_SSE(arg) (((arg) & 0x10) != 0)
+#define X86_64_ARG_IS_X87(arg) (((arg) & 0x20) != 0)
+
+/*
+ * The granularity of the stack
+ */
+#define STACK_SLOT_SIZE        sizeof(void *)
+
+/*
+ * Get the number of complete stack slots used
+ */
+#define STACK_SLOTS_USED(size) ((size) >> 3)
+
+/*
+ * Round a size up to a multiple of the stack word size.
+ */
+#define        ROUND_STACK(size)       \
+               (((size) + (STACK_SLOT_SIZE - 1)) & ~(STACK_SLOT_SIZE - 1))
+
+/*
+ * Setup or teardown the x86_64 code output process.
+ */
+#define        jit_cache_setup_output(needed)  \
+       unsigned char *inst = gen->posn.ptr; \
+       if(!jit_cache_check_for_n(&(gen->posn), (needed))) \
+       { \
+               jit_cache_mark_full(&(gen->posn)); \
+               return; \
+       }
+#define        jit_cache_end_output()  \
+       gen->posn.ptr = inst
+
+/*
+ * Set this to 1 for debugging fixups
+ */
+#define DEBUG_FIXUPS 0
+
+/*
+ * The maximum block size copied inline
+ */
+#define _JIT_MAX_MEMCPY_INLINE 0x40
+
+/*
+ * va_list type as specified in x86_64 sysv abi version 0.99
+ * Figure 3.34
+ */
+typedef struct
+{
+       unsigned int gp_offset;
+       unsigned int fp_offset;
+       void *overflow_arg_area;
+       void *reg_save_area;
+} _jit_va_list;
+
+/* Registers used for INTEGER arguments */
+static int _jit_word_arg_regs[] = {X86_64_REG_RDI, X86_64_REG_RSI,
+                                                                  
X86_64_REG_RDX, X86_64_REG_RCX,
+                                                                  
X86_64_REG_R8, X86_64_REG_R9};
+#define _jit_num_word_regs     6
+
+/* Registers used for float arguments */
+static int _jit_float_arg_regs[] = {X86_64_REG_XMM0, X86_64_REG_XMM1,
+                                                                       
X86_64_REG_XMM2, X86_64_REG_XMM3,
+                                                                       
X86_64_REG_XMM4, X86_64_REG_XMM5,
+                                                                       
X86_64_REG_XMM6, X86_64_REG_XMM7};
+#define _jit_num_float_regs    8
+
+/* Registers used for returning INTEGER values */
+static int _jit_word_return_regs[] = {X86_64_REG_RAX, X86_64_REG_RDX};
+#define _jit_num_word_return_regs      2
+
+/* Registers used for returning sse values */
+static int _jit_sse_return_regs[] = {X86_64_REG_XMM0, X86_64_REG_XMM1};
+#define _jit_num_sse_return_regs       2
+
+/*
+ * X86_64 register classes
+ */
+static _jit_regclass_t *x86_64_reg;            /* X86_64 general purpose 
registers */
+static _jit_regclass_t *x86_64_creg;   /* X86_64 call clobbered general */
+                                                                               
/* purpose registers */
+static _jit_regclass_t *x86_64_rreg;   /* general purpose registers not used*/
+                                                                               
/* for returning values */
+static _jit_regclass_t *x86_64_freg;   /* X86_64 fpu registers */
+static _jit_regclass_t *x86_64_xreg;   /* X86_64 xmm registers */
+
+/*
+ * Initialize the x86_64 backend: create the register classes that the
+ * register allocator selects from.  Must run before any code generation.
+ */
+void
+_jit_init_backend(void)
+{
+       x86_64_reg = _jit_regclass_create(
+               "reg", JIT_REG_WORD | JIT_REG_LONG, 14,
+               X86_64_REG_RAX, X86_64_REG_RCX,
+               X86_64_REG_RDX, X86_64_REG_RBX,
+               X86_64_REG_RSI, X86_64_REG_RDI,
+               X86_64_REG_R8, X86_64_REG_R9,
+               X86_64_REG_R10, X86_64_REG_R11,
+               X86_64_REG_R12, X86_64_REG_R13,
+               X86_64_REG_R14, X86_64_REG_R15);
+
+       /* register class with all call clobbered registers */
+       x86_64_creg = _jit_regclass_create(
+               "creg", JIT_REG_WORD | JIT_REG_LONG, 9,
+               X86_64_REG_RAX, X86_64_REG_RCX,
+               X86_64_REG_RDX, X86_64_REG_RSI,
+               X86_64_REG_RDI, X86_64_REG_R8,
+               X86_64_REG_R9, X86_64_REG_R10,
+               X86_64_REG_R11);
+
+       /* register class with all registers not used for returning values */
+       x86_64_rreg = _jit_regclass_create(
+               "rreg", JIT_REG_WORD | JIT_REG_LONG, 12,
+               X86_64_REG_RCX, X86_64_REG_RBX,
+               X86_64_REG_RSI, X86_64_REG_RDI,
+               X86_64_REG_R8, X86_64_REG_R9,
+               X86_64_REG_R10, X86_64_REG_R11,
+               X86_64_REG_R12, X86_64_REG_R13,
+               X86_64_REG_R14, X86_64_REG_R15);
+
+       /* x87 stack registers (kept as a stacked class) */
+       x86_64_freg = _jit_regclass_create(
+               "freg", JIT_REG_X86_64_FLOAT | JIT_REG_IN_STACK, 8,
+               X86_64_REG_ST0, X86_64_REG_ST1,
+               X86_64_REG_ST2, X86_64_REG_ST3,
+               X86_64_REG_ST4, X86_64_REG_ST5,
+               X86_64_REG_ST6, X86_64_REG_ST7);
+
+       /* sse registers for float32/float64 arithmetic */
+       x86_64_xreg = _jit_regclass_create(
+               "xreg", JIT_REG_FLOAT32 | JIT_REG_FLOAT64, 16,
+               X86_64_REG_XMM0, X86_64_REG_XMM1,
+               X86_64_REG_XMM2, X86_64_REG_XMM3,
+               X86_64_REG_XMM4, X86_64_REG_XMM5,
+               X86_64_REG_XMM6, X86_64_REG_XMM7,
+               X86_64_REG_XMM8, X86_64_REG_XMM9,
+               X86_64_REG_XMM10, X86_64_REG_XMM11,
+               X86_64_REG_XMM12, X86_64_REG_XMM13,
+               X86_64_REG_XMM14, X86_64_REG_XMM15);
+}
+
+/*
+ * Return non-zero if this backend has a code-generation rule for the
+ * given opcode.  The case labels are generated into
+ * jit-rules-x86-64.inc from jit-rules-x86-64.ins by gen-rules.
+ */
+int
+_jit_opcode_is_supported(int opcode)
+{
+       switch(opcode)
+       {
+       #define JIT_INCLUDE_SUPPORTED
+       #include "jit-rules-x86-64.inc"
+       #undef JIT_INCLUDE_SUPPORTED
+       }
+       return 0;
+}
+
+/*
+ * Place the target of an indirect call into R11, the scratch register
+ * (call-clobbered and not used for SysV parameter passing).
+ * Returns zero on out-of-memory, like jit_insn_outgoing_reg.
+ */
+int
+_jit_setup_indirect_pointer(jit_function_t func, jit_value_t value)
+{
+       return jit_insn_outgoing_reg(func, value, X86_64_REG_R11);
+}
+
+/*
+ * Do a xmm operation with a constant float32 value
+ */
+/*
+ * Copy the float32 constant into the code cache and emit an xmm
+ * operation on it, preferring RIP-relative and falling back to
+ * absolute 32-bit addressing.  Returns 0 on cache overflow or when
+ * neither addressing form can reach the constant (the extra-register
+ * fallback is still TODO).
+ * NOTE(review): the "(reg > 7 ? 9 : 8)" term assumes the emitted
+ * instruction is 8 bytes (9 with a REX prefix) — confirm against
+ * x86_64_xmm1_reg_membase.
+ */
+static int
+_jit_xmm1_reg_imm_size_float32(jit_gencode_t gen, unsigned char **inst_ptr,
+                                                          X86_64_XMM1_OP opc, 
int reg,
+                                                          jit_float32 
*float32_value)
+{
+       void *ptr;
+       jit_nint offset;
+       unsigned char *inst;
+
+       inst = *inst_ptr;
+       ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_float32));
+       if(!ptr)
+       {
+               return 0;
+       }
+       jit_memcpy(ptr, float32_value, sizeof(jit_float32));
+
+       offset = (jit_nint)ptr - ((jit_nint)inst + (reg > 7 ? 9 : 8));
+       if((offset >= jit_min_int) && (offset <= jit_max_int))
+       {
+               /* We can use RIP relative addressing here */
+               x86_64_xmm1_reg_membase(inst, opc, reg,
+                                                                        
X86_64_RIP, offset, 0);
+       }
+       else if(((jit_nint)ptr >= jit_min_int) &&
+                       ((jit_nint)ptr <= jit_max_int))
+       {
+               /* We can use absolute addressing */
+               x86_64_xmm1_reg_mem(inst, opc, reg, (jit_nint)ptr, 0);
+       }
+       else
+       {
+               /* We have to use an extra general register */
+               /* TODO */
+               return 0;
+       }
+       *inst_ptr = inst;
+       return 1;
+}
+
+/*
+ * Do a xmm operation with a constant float64 value
+ */
+/*
+ * Copy the float64 constant into the code cache and emit an xmm
+ * operation on it, preferring RIP-relative and falling back to
+ * absolute 32-bit addressing.  Returns 0 on cache overflow or when
+ * neither addressing form can reach the constant (the extra-register
+ * fallback is still TODO).  Mirrors the float32 variant except for
+ * the size flag passed to the xmm1 emitters.
+ */
+static int
+_jit_xmm1_reg_imm_size_float64(jit_gencode_t gen, unsigned char **inst_ptr,
+                                                          X86_64_XMM1_OP opc, 
int reg,
+                                                          jit_float64 
*float64_value)
+{
+       void *ptr;
+       jit_nint offset;
+       unsigned char *inst;
+
+       inst = *inst_ptr;
+       ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_float64));
+       if(!ptr)
+       {
+               return 0;
+       }
+       jit_memcpy(ptr, float64_value, sizeof(jit_float64));
+
+       offset = (jit_nint)ptr - ((jit_nint)inst + (reg > 7 ? 9 : 8));
+       if((offset >= jit_min_int) && (offset <= jit_max_int))
+       {
+               /* We can use RIP relative addressing here */
+               x86_64_xmm1_reg_membase(inst, opc, reg,
+                                                                        
X86_64_RIP, offset, 1);
+       }
+       else if(((jit_nint)ptr >= jit_min_int) &&
+                       ((jit_nint)ptr <= jit_max_int))
+       {
+               /* We can use absolute addressing */
+               x86_64_xmm1_reg_mem(inst, opc, reg, (jit_nint)ptr, 1);
+       }
+       else
+       {
+               /* We have to use an extra general register */
+               /* TODO */
+               return 0;
+       }
+       *inst_ptr = inst;
+       return 1;
+}
+
+/*
+ * Call a function
+ */
+static unsigned char *
+x86_64_call_code(unsigned char *inst, jit_nint func)
+{
+       jit_nint offset;
+
+       offset = func - ((jit_nint)inst + 5);
+       if(offset >= jit_min_int && offset <= jit_max_int)
+       {
+               /* We can use the immediate call */
+               x86_64_call_imm(inst, offset);
+       }
+       else
+       {
+               /* We have to do a call via register */
+               x86_64_mov_reg_imm_size(inst, X86_64_SCRATCH, func, 8);
+               x86_64_call_reg(inst, X86_64_SCRATCH);
+       }
+       return inst;
+}
+
+/*
+ * Jump to a function
+ */
+static unsigned char *
+x86_64_jump_to_code(unsigned char *inst, jit_nint func)
+{
+       jit_nint offset;
+
+       offset = func - ((jit_nint)inst + 5);
+       if(offset >= jit_min_int && offset <= jit_max_int)
+       {
+               /* We can use the immediate call */
+               x86_64_jmp_imm(inst, offset);
+       }
+       else
+       {
+               /* We have to do a call via register */
+               x86_64_mov_reg_imm_size(inst, X86_64_SCRATCH, func, 8);
+               x86_64_jmp_reg(inst, X86_64_SCRATCH);
+       }
+       return inst;
+}
+
/*
 * Throw a builtin exception.
 *
 * Emits code that records the current code address in the setjmp
 * buffer's "catch_pc" slot (when the function has a "try" block)
 * and then calls jit_exception_builtin with the exception type.
 */
static unsigned char *
throw_builtin(unsigned char *inst, jit_function_t func, int type)
{
	/* We need to update "catch_pc" if we have a "try" block */
	if(func->builder->setjmp_value != 0)
	{
		_jit_gen_fix_value(func->builder->setjmp_value);

		/* lea rdi, [rip + 0] yields the address of the following
		   instruction, which is then stored into the catch_pc
		   field within the setjmp value's frame slot */
		x86_64_lea_membase_size(inst, X86_64_RDI, X86_64_RIP, 0, 8);
		x86_64_mov_membase_reg_size(inst, X86_64_RBP,
					func->builder->setjmp_value->frame_offset
					+ jit_jmp_catch_pc_offset, X86_64_RDI, 8);
	}

	/* Load the exception type into RDI, the first integer argument
	   register in the x86-64 SysV calling convention (the original
	   comment said "push onto the stack", which is not what happens) */
	x86_64_mov_reg_imm_size(inst, X86_64_RDI, type, 4);

	/* Call the "jit_exception_builtin" function, which will never return */
	return x86_64_call_code(inst, (jit_nint)jit_exception_builtin);
}
+
/*
 * Spill a register to its place in the current stack frame.
 * The argument type must be in its normalized form.
 *
 * The value is stored at [RBP + offset].  The store instruction is
 * chosen from the register class of "reg" (general, xmm or fpu) and
 * the type kind; unhandled kinds emit nothing.
 */
static void
_spill_reg(unsigned char **inst_ptr, jit_type_t type,
	   jit_int reg, jit_int offset)
{
	unsigned char *inst = *inst_ptr;

	if(IS_GENERAL_REG(reg))
	{
		switch(type->kind)
		{
			case JIT_TYPE_SBYTE:
			case JIT_TYPE_UBYTE:
			{
				/* 1-byte store */
				x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset,
							    _jit_reg_info[reg].cpu_reg, 1);
			}
			break;

			case JIT_TYPE_SHORT:
			case JIT_TYPE_USHORT:
			{
				/* 2-byte store */
				x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset,
							    _jit_reg_info[reg].cpu_reg, 2);
			}
			break;

			case JIT_TYPE_INT:
			case JIT_TYPE_UINT:
			case JIT_TYPE_FLOAT32:
			{
				/* 4-byte store (float32 bits held in a GP register) */
				x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset,
							    _jit_reg_info[reg].cpu_reg, 4);
			}
			break;

			case JIT_TYPE_LONG:
			case JIT_TYPE_ULONG:
			case JIT_TYPE_FLOAT64:
			{
				/* 8-byte store (float64 bits held in a GP register) */
				x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset,
							    _jit_reg_info[reg].cpu_reg, 8);
			}
			break;

			case JIT_TYPE_STRUCT:
			case JIT_TYPE_UNION:
			{
				jit_nuint size = jit_type_get_size(type);

				/* NOTE(review): sizes 3 and 5..7 are stored with the
				   next larger move, writing past the value's size --
				   presumably the frame slot is rounded up; confirm. */
				if(size == 1)
				{
					x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset,
								    _jit_reg_info[reg].cpu_reg, 1);
				}
				else if(size == 2)
				{
					x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset,
								    _jit_reg_info[reg].cpu_reg, 2);
				}
				else if(size <= 4)
				{
					x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset,
								    _jit_reg_info[reg].cpu_reg, 4);
				}
				else
				{
					x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset,
								    _jit_reg_info[reg].cpu_reg, 8);
				}
			}
		}
	}
	else if(IS_XMM_REG(reg))
	{
		switch(type->kind)
		{
			case JIT_TYPE_FLOAT32:
			{
				x86_64_movss_membase_reg(inst, X86_64_RBP, offset,
							 _jit_reg_info[reg].cpu_reg);
			}
			break;

			case JIT_TYPE_FLOAT64:
			{
				x86_64_movsd_membase_reg(inst, X86_64_RBP, offset,
							 _jit_reg_info[reg].cpu_reg);
			}
			break;

			case JIT_TYPE_STRUCT:
			case JIT_TYPE_UNION:
			{
				jit_nuint size = jit_type_get_size(type);

				if(size <= 4)
				{
					x86_64_movss_membase_reg(inst, X86_64_RBP, offset,
								 _jit_reg_info[reg].cpu_reg);
				}
				else if(size <= 8)
				{
					x86_64_movsd_membase_reg(inst, X86_64_RBP, offset,
								 _jit_reg_info[reg].cpu_reg);
				}
				else
				{
					/* Full 16-byte store; use the aligned form when the
					   type's alignment is a multiple of 16.
					   NOTE(review): only 16 bytes are stored here --
					   presumably structs kept in an xmm register never
					   exceed 16 bytes; confirm. */
					jit_nint alignment = jit_type_get_alignment(type);

					if((alignment & 0xf) == 0)
					{
						x86_64_movaps_membase_reg(inst, X86_64_RBP, offset,
									  _jit_reg_info[reg].cpu_reg);
					}
					else
					{
						x86_64_movups_membase_reg(inst, X86_64_RBP, offset,
									  _jit_reg_info[reg].cpu_reg);
					}
				}
			}
			break;
		}
	}
	else if(IS_FPU_REG(reg))
	{
		/* fstp stores ST(0) and pops the FPU stack */
		switch(type->kind)
		{
			case JIT_TYPE_FLOAT32:
			{
				x86_64_fstp_membase_size(inst, X86_64_RBP, offset, 4);
			}
			break;

			case JIT_TYPE_FLOAT64:
			{
				x86_64_fstp_membase_size(inst, X86_64_RBP, offset, 8);
			}
			break;

			case JIT_TYPE_NFLOAT:
			{
				/* nfloat is either a plain double or an 80-bit
				   extended value, depending on the platform */
				if(sizeof(jit_nfloat) == sizeof(jit_float64))
				{
					x86_64_fstp_membase_size(inst, X86_64_RBP, offset, 8);
				}
				else
				{
					x86_64_fstp_membase_size(inst, X86_64_RBP, offset, 10);
				}
			}
			break;
		}
	}

	/* Write the current instruction pointer back */
	*inst_ptr = inst;
}
+
+void
+_jit_gen_fix_value(jit_value_t value)
+{
+       if(!(value->has_frame_offset) && !(value->is_constant))
+       {
+               jit_nuint alignment = jit_type_get_alignment(value->type);
+               jit_nint size =jit_type_get_size(value->type);
+               jit_nint frame_size = value->block->func->builder->frame_size;
+
+               /* Round the size to a multiple of the stack item size */
+               size = (jit_nint)(ROUND_STACK(size));
+
+               /* Add the size to the existing local items */
+               frame_size += size;
+
+               /* Align the new frame_size for the value */
+               frame_size = (frame_size + (alignment - 1)) & ~(alignment - 1);
+
+               value->block->func->builder->frame_size = frame_size;
+               value->frame_offset = -frame_size;
+               value->has_frame_offset = 1;
+       }
+}
+
+void
+_jit_gen_spill_global(jit_gencode_t gen, int reg, jit_value_t value)
+{
+       jit_cache_setup_output(16);
+       if(value)
+       {
+               jit_type_t type = jit_type_normalize(value->type);
+
+               _jit_gen_fix_value(value);
+
+               _spill_reg(&inst, type, value->global_reg, value->frame_offset);
+       }
+       else
+       {
+               x86_64_push_reg_size(inst, _jit_reg_info[reg].cpu_reg, 8);
+       }
+       jit_cache_end_output();
+}
+
+void
+_jit_gen_load_global(jit_gencode_t gen, int reg, jit_value_t value)
+{
+       jit_cache_setup_output(16);
+       if(value)
+       {
+               x86_64_mov_reg_membase_size(inst,
+                       _jit_reg_info[value->global_reg].cpu_reg,
+                       X86_64_RBP, value->frame_offset, 8);
+       }
+       else
+       {
+               x86_64_pop_reg_size(inst, _jit_reg_info[reg].cpu_reg, 8);
+       }
+       jit_cache_end_output();
+}
+
+void
+_jit_gen_spill_reg(jit_gencode_t gen, int reg,
+                                  int other_reg, jit_value_t value)
+{
+       jit_type_t type;
+
+       /* Make sure that we have sufficient space */
+       jit_cache_setup_output(16);
+
+       /* If the value is associated with a global register, then copy to that 
*/
+       if(value->has_global_register)
+       {
+               reg = _jit_reg_info[reg].cpu_reg;
+               other_reg = _jit_reg_info[value->global_reg].cpu_reg;
+               x86_64_mov_reg_reg_size(inst, other_reg, reg, sizeof(void *));
+               jit_cache_end_output();
+               return;
+       }
+
+       /* Fix the value in place within the local variable frame */
+       _jit_gen_fix_value(value);
+
+       /* Get the normalized type */
+       type = jit_type_normalize(value->type);
+
+       /* and spill the register */
+       _spill_reg(&inst, type, reg, value->frame_offset);
+
+       /* End the code output process */
+       jit_cache_end_output();
+}
+
+void
+_jit_gen_free_reg(jit_gencode_t gen, int reg,
+                                 int other_reg, int value_used)
+{
+       /* We only need to take explicit action if we are freeing a
+          floating-point register whose value hasn't been used yet */
+       if(!value_used && IS_FPU_REG(reg))
+       {
+               if(jit_cache_check_for_n(&(gen->posn), 2))
+               {
+                       x86_fstp(gen->posn.ptr, reg - X86_64_REG_ST0);
+               }
+               else
+               {
+                       jit_cache_mark_full(&(gen->posn));
+               }
+       }
+}
+
/*
 * Set a register value based on a condition code.
 */
static unsigned char *
setcc_reg(unsigned char *inst, int reg, int cond, int is_signed)
{
	/* Use a SETcc instruction if we have a basic register.
	   SETcc writes only the low byte, so zero-extend to 32 bits
	   (which on x86-64 also clears the upper half of the register). */
	x86_64_set_reg(inst, cond, reg, is_signed);
	x86_64_movzx8_reg_reg_size(inst, reg, reg, 4);
	return inst;
}
+
/*
 * Helper macros for fixup handling.
 *
 * Unresolved branch targets are threaded into a linked list stored
 * in the 32 bit displacement fields of the emitted instructions.
 * We have only 4 bytes for the jump offsets, therefore we have to do
 * something tricky here: we need some fixed value that is known to
 * be fixed throughout the building of the function and that will be
 * near the emitted code.  The posn limit looks like the perfect
 * value to use.
 */
#define _JIT_GET_FIXVALUE(gen) ((gen)->posn.limit)

/*
 * Calculate the fixup value.
 * This is the value stored as placeholder in the instruction:
 * the distance from the previous fixup to the current position.
 */
#define _JIT_CALC_FIXUP(fixup_list, inst) \
	((jit_int)((jit_nint)(inst) - (jit_nint)(fixup_list)))

/*
 * Calculate the delta to the next fixup in the list
 * (a zero "fixup" terminates the list).
 */
#define _JIT_CALC_NEXT_FIXUP(fixup_list, fixup) \
	((fixup) ? ((jit_nint)(fixup_list) - (jit_nint)(fixup)) : (jit_nint)0)
+
/*
 * Get the long form (rel32) of a branch opcode.
 *
 * The unconditional short jump 0xEB maps to 0xE9; the conditional
 * short jumps 0x7x map to the two-byte opcodes 0x0F 0x8x.
 */
static int
long_form_branch(int opcode)
{
	return (opcode == 0xEB) ? 0xE9 : (opcode + 0x0F10);
}
+
/*
 * Output a branch instruction.
 *
 * If the target block's address is already known, a direct backwards
 * branch is emitted (short form when the offset fits in 8 bits).
 * Otherwise a long-form branch with a placeholder displacement is
 * emitted and linked into the block's fixup list so it can be patched
 * once the block is placed.
 */
static unsigned char *
output_branch(jit_function_t func, unsigned char *inst, int opcode,
	      jit_insn_t insn)
{
	jit_block_t block;

	if((insn->flags & JIT_INSN_VALUE1_IS_LABEL) != 0)
	{
		/* "address_of_label" instruction */
		block = jit_block_from_label(func, (jit_label_t)(insn->value1));
	}
	else
	{
		block = jit_block_from_label(func, (jit_label_t)(insn->dest));
	}
	if(!block)
	{
		return inst;
	}
	if(block->address)
	{
		jit_nint offset;

		/* We already know the address of the block.  The offset is
		   first computed relative to the end of the 2-byte short
		   form (opcode + rel8) */
		offset = ((unsigned char *)(block->address)) - (inst + 2);
		if(x86_is_imm8(offset))
		{
			/* We can output a short-form backwards branch */
			*inst++ = (unsigned char)opcode;
			*inst++ = (unsigned char)offset;
		}
		else
		{
			/* We need to output a long-form backwards branch.
			   Adjust for the larger encoding: 3 extra bytes for
			   rel32 instead of rel8, plus one more if the long
			   opcode is two bytes */
			offset -= 3;
			opcode = long_form_branch(opcode);
			if(opcode < 256)
			{
				*inst++ = (unsigned char)opcode;
			}
			else
			{
				*inst++ = (unsigned char)(opcode >> 8);
				*inst++ = (unsigned char)opcode;
				--offset;
			}
			x86_imm_emit32(inst, offset);
		}
	}
	else
	{
		jit_int fixup;

		/* Output a placeholder and record on the block's fixup list */
		opcode = long_form_branch(opcode);
		if(opcode < 256)
		{
			*inst++ = (unsigned char)opcode;
		}
		else
		{
			*inst++ = (unsigned char)(opcode >> 8);
			*inst++ = (unsigned char)opcode;
		}
		/* The placeholder holds the delta to the previous fixup;
		   zero terminates the list */
		if(block->fixup_list)
		{
			fixup = _JIT_CALC_FIXUP(block->fixup_list, inst);
		}
		else
		{
			fixup = 0;
		}
		block->fixup_list = (void *)inst;
		x86_imm_emit32(inst, fixup);

		if(DEBUG_FIXUPS)
		{
			fprintf(stderr,
				"Block: %lx, Current Fixup: %lx, Next fixup: %lx\n",
				(jit_nint)block, (jit_nint)(block->fixup_list),
				(jit_nint)fixup);
		}
	}
	return inst;
}
+
+/*
+ * Jump to the current function's epilog.
+ */
+static unsigned char *
+jump_to_epilog(jit_gencode_t gen, unsigned char *inst, jit_block_t block)
+{
+       jit_int fixup;
+
+       /* If the epilog is the next thing that we will output,
+          then fall through to the epilog directly */
+       block = block->next;
+       while(block != 0 && block->first_insn > block->last_insn)
+       {
+               block = block->next;
+       }
+       if(!block)
+       {
+               return inst;
+       }
+
+       /* Output a placeholder for the jump and add it to the fixup list */
+       *inst++ = (unsigned char)0xE9;
+       if(gen->epilog_fixup)
+       {
+               fixup = _JIT_CALC_FIXUP(gen->epilog_fixup, inst);
+       }
+       else
+       {
+               fixup = 0;
+       }
+       gen->epilog_fixup = (void *)inst;
+       x86_imm_emit32(inst, fixup);
+       return inst;
+}
+
/*
 * Support functions for the FPU register stack.
 */

/*
 * Convert an abstract FPU register number into its current position
 * on the hardware FPU stack (0 == top of stack).
 */
static int
fp_stack_index(jit_gencode_t gen, int reg)
{
	return gen->reg_stack_top - reg - 1;
}
+
/*
 * Exchange the top of the FPU register stack with the given register.
 * Non-FPU registers require no action.
 */
void
_jit_gen_exch_top(jit_gencode_t gen, int reg)
{
	if(IS_FPU_REG(reg))
	{
		/* fxch swaps ST(0) with the stack slot holding "reg" */
		jit_cache_setup_output(2);
		x86_fxch(inst, fp_stack_index(gen, reg));
		jit_cache_end_output();
	}
}
+
/*
 * Pop the top of the FPU stack into the given FPU register
 * (fstp st(i) stores ST(0) into ST(i) and pops the stack).
 * Non-FPU registers require no action.
 */
void
_jit_gen_move_top(jit_gencode_t gen, int reg)
{
	if(IS_FPU_REG(reg))
	{
		jit_cache_setup_output(2);
		x86_fstp(inst, fp_stack_index(gen, reg));
		jit_cache_end_output();
	}
}
+
/*
 * Spill the top of the FPU register stack into "value"'s frame slot.
 * When "pop" is zero the value is kept on the FPU stack as well.
 */
void
_jit_gen_spill_top(jit_gencode_t gen, int reg, jit_value_t value, int pop)
{
	if(IS_FPU_REG(reg))
	{
		int offset;

		/* Make sure that we have sufficient space */
		jit_cache_setup_output(16);

		/* Fix the value in place within the local variable frame */
		_jit_gen_fix_value(value);

		/* Output an appropriate instruction to spill the value */
		offset = (int)(value->frame_offset);

		/* Spill the top of the floating-point register stack */
		switch(jit_type_normalize(value->type)->kind)
		{
			case JIT_TYPE_FLOAT32:
			{
				/* 4-byte store; fstp pops, fst keeps ST(0) */
				if(pop)
				{
					x86_64_fstp_membase_size(inst, X86_64_RBP, offset, 4);
				}
				else
				{
					x86_64_fst_membase_size(inst, X86_64_RBP, offset, 4);
				}
			}
			break;

			case JIT_TYPE_FLOAT64:
			{
				/* 8-byte store; fstp pops, fst keeps ST(0) */
				if(pop)
				{
					x86_64_fstp_membase_size(inst, X86_64_RBP, offset, 8);
				}
				else
				{
					x86_64_fst_membase_size(inst, X86_64_RBP, offset, 8);
				}
			}
			break;

			case JIT_TYPE_NFLOAT:
			{
				if(sizeof(jit_nfloat) == sizeof(jit_float64))
				{
					if(pop)
					{
						x86_64_fstp_membase_size(inst, X86_64_RBP, offset, 8);
					}
					else
					{
						x86_64_fst_membase_size(inst, X86_64_RBP, offset, 8);
					}
				}
				else
				{
					/* There is no non-popping 80-bit store, so
					   store-and-pop, then reload when the value
					   must stay on the FPU stack */
					x86_64_fstp_membase_size(inst, X86_64_RBP, offset, 10);
					if(!pop)
					{
						x86_64_fld_membase_size(inst, X86_64_RBP, offset, 10);
					}
				}
			}
			break;
		}

		/* End the code output process */
		jit_cache_end_output();
	}
}
+
+void
+_jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t 
value)
+{
+       jit_type_t type;
+       int src_reg, other_src_reg;
+       void *ptr;
+       int offset;
+
+       /* Make sure that we have sufficient space */
+       jit_cache_setup_output(16);
+
+       type = jit_type_normalize(value->type);
+
+       /* Load zero */
+       if(value->is_constant)
+       {
+               switch(type->kind)
+               {
+                       case JIT_TYPE_SBYTE:
+                       case JIT_TYPE_UBYTE:
+                       case JIT_TYPE_SHORT:
+                       case JIT_TYPE_USHORT:
+                       case JIT_TYPE_INT:
+                       case JIT_TYPE_UINT:
+                       {
+                               if((jit_nint)(value->address) == 0)
+                               {
+                                       x86_64_clear_reg(inst, 
_jit_reg_info[reg].cpu_reg);
+                               }
+                               else
+                               {
+                                       x86_64_mov_reg_imm_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                       
(jit_nint)(value->address), 4);
+                               }
+                       }
+                       break;
+
+                       case JIT_TYPE_LONG:
+                       case JIT_TYPE_ULONG:
+                       {
+                               if((jit_nint)(value->address) == 0)
+                               {
+                                       x86_64_clear_reg(inst, 
_jit_reg_info[reg].cpu_reg);
+                               }
+                               else
+                               {
+                                       x86_64_mov_reg_imm_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                       
(jit_nint)(value->address), 8);
+                               }
+                       }
+                       break;
+
+                       case JIT_TYPE_FLOAT32:
+                       {
+                               jit_float32 float32_value;
+
+                               float32_value = 
jit_value_get_float32_constant(value);
+
+                               if(IS_GENERAL_REG(reg))
+                               {
+                                       union
+                                       {
+                                               jit_float32 float32_value;
+                                               jit_int int_value;
+                                       } un;
+
+                                       un.float32_value = float32_value;
+                                       x86_64_mov_reg_imm_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
        un.int_value, 4);
+                               }
+                               else if(IS_XMM_REG(reg))
+                               {
+                                       int xmm_reg = 
_jit_reg_info[reg].cpu_reg;
+
+                                       _jit_xmm1_reg_imm_size_float32(gen, 
&inst, XMM1_MOV,
+                                                                               
                   xmm_reg, &float32_value);
+                               }
+                               else
+                               {
+                                       if(float32_value == (jit_float32) 0.0)
+                                       {
+                                               x86_fldz(inst);
+                                       }
+                                       else if(float32_value == (jit_float32) 
1.0)
+                                       {
+                                               x86_fld1(inst);
+                                       }
+                                       else
+                                       {
+                                               jit_nint offset;
+
+                                               ptr = 
_jit_cache_alloc(&(gen->posn), sizeof(jit_float32));
+                                               jit_memcpy(ptr, &float32_value, 
sizeof(float32_value));
+
+                                               offset = (jit_nint)ptr - 
((jit_nint)inst + 7);
+                                               if((offset >= jit_min_int) && 
(offset <= jit_max_int))
+                                               {
+                                                       /* We can use RIP 
relative addressing here */
+                                                       
x86_64_fld_membase_size(inst, X86_64_RIP, offset, 4);
+                                               }
+                                               else if(((jit_nint)ptr >= 
jit_min_int) &&
+                                                               ((jit_nint)ptr 
<= jit_max_int))
+                                               {
+                                                       /* We can use absolute 
addressing */
+                                                       
x86_64_fld_mem_size(inst, (jit_nint)ptr, 4);
+                                               }
+                                               else
+                                               {
+                                                       /* We have to use an 
extra general register */
+                                                       /* TODO */
+                                               }
+                                       }
+                               }
+                       }
+                       break;
+
+                       case JIT_TYPE_FLOAT64:
+                       {
+                               jit_float64 float64_value;
+                               float64_value = 
jit_value_get_float64_constant(value);
+                               if(IS_GENERAL_REG(reg))
+                               {
+                                       union
+                                       {
+                                               jit_float64 float64_value;
+                                               jit_long long_value;
+                                       } un;
+
+                                       un.float64_value = float64_value;
+                                       x86_64_mov_reg_imm_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
        un.long_value, 8);
+                               }
+                               else if(IS_XMM_REG(reg))
+                               {
+                                       int xmm_reg = 
_jit_reg_info[reg].cpu_reg;
+
+                                       _jit_xmm1_reg_imm_size_float64(gen, 
&inst, XMM1_MOV,
+                                                                               
                   xmm_reg, &float64_value);
+                               }
+                               else
+                               {
+                                       if(float64_value == (jit_float64) 0.0)
+                                       {
+                                               x86_fldz(inst);
+                                       }
+                                       else if(float64_value == (jit_float64) 
1.0)
+                                       {
+                                               x86_fld1(inst);
+                                       }
+                                       else
+                                       {
+                                               jit_nint offset;
+
+                                               ptr = 
_jit_cache_alloc(&(gen->posn), sizeof(jit_float64));
+                                               jit_memcpy(ptr, &float64_value, 
sizeof(float64_value));
+
+                                               offset = (jit_nint)ptr - 
((jit_nint)inst + 7);
+                                               if((offset >= jit_min_int) && 
(offset <= jit_max_int))
+                                               {
+                                                       /* We can use RIP 
relative addressing here */
+                                                       
x86_64_fld_membase_size(inst, X86_64_RIP, offset, 8);
+                                               }
+                                               else if(((jit_nint)ptr >= 
jit_min_int) &&
+                                                               ((jit_nint)ptr 
<= jit_max_int))
+                                               {
+                                                       /* We can use absolute 
addressing */
+                                                       
x86_64_fld_mem_size(inst, (jit_nint)ptr, 8);
+                                               }
+                                               else
+                                               {
+                                                       /* We have to use an 
extra general register */
+                                                       /* TODO */
+                                               }
+                                       }
+                               }
+                       }
+                       break;
+
+                       case JIT_TYPE_NFLOAT:
+                       {
+                               jit_nfloat nfloat_value;
+                               nfloat_value = 
jit_value_get_nfloat_constant(value);
+                               if(IS_GENERAL_REG(reg) && sizeof(jit_nfloat) == 
sizeof(jit_float64))
+                               {
+                                       union
+                                       {
+                                               jit_nfloat nfloat_value;
+                                               jit_long long_value;
+                                       } un;
+
+                                       un.nfloat_value = nfloat_value;
+                                       x86_64_mov_reg_imm_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
        un.long_value, 8);
+                               }
+                               else if(IS_XMM_REG(reg) && sizeof(jit_nfloat) 
== sizeof(jit_float64))
+                               {
+                                       jit_nint offset;
+                                       int xmm_reg = 
_jit_reg_info[reg].cpu_reg;
+
+                                       ptr = _jit_cache_alloc(&(gen->posn), 
sizeof(jit_nfloat));
+                                       jit_memcpy(ptr, &nfloat_value, 
sizeof(nfloat_value));
+                                       offset = (jit_nint)ptr - 
+                                                               ((jit_nint)inst 
+ (xmm_reg > 7 ? 9 : 8));
+                                       if((offset >= jit_min_int) && (offset 
<= jit_max_int))
+                                       {
+                                               /* We can use RIP relative 
addressing here */
+                                               x86_64_movsd_reg_membase(inst, 
xmm_reg, X86_64_RIP, offset);
+                                       }
+                                       else if(((jit_nint)ptr >= jit_min_int) 
&&
+                                                       ((jit_nint)ptr <= 
jit_max_int))
+                                       {
+                                               /* We can use absolute 
addressing */
+                                               x86_64_movsd_reg_mem(inst, 
xmm_reg, (jit_nint)ptr);
+                                       }
+                                       else
+                                       {
+                                               /* We have to use an extra 
general register */
+                                               /* TODO */
+                                       }
+                               }
+                               else
+                               {
+                                       if(nfloat_value == (jit_nfloat) 0.0)
+                                       {
+                                               x86_fldz(inst);
+                                       }
+                                       else if(nfloat_value == (jit_nfloat) 
1.0)
+                                       {
+                                               x86_fld1(inst);
+                                       }
+                                       else
+                                       {
+                                               jit_nint offset;
+
+                                               ptr = 
_jit_cache_alloc(&(gen->posn), sizeof(jit_nfloat));
+                                               jit_memcpy(ptr, &nfloat_value, 
sizeof(nfloat_value));
+
+                                               offset = (jit_nint)ptr - 
((jit_nint)inst + 7);
+                                               if((offset >= jit_min_int) && 
(offset <= jit_max_int))
+                                               {
+                                                       /* We can use RIP 
relative addressing here */
+                                                       if(sizeof(jit_nfloat) 
== sizeof(jit_float64))
+                                                       {
+                                                               
x86_64_fld_membase_size(inst, X86_64_RIP, offset, 8);
+                                                       }
+                                                       else
+                                                       {
+                                                               
x86_64_fld_membase_size(inst, X86_64_RIP, offset, 10);
+                                                       }
+                                               }
+                                               else if(((jit_nint)ptr >= 
jit_min_int) &&
+                                                               ((jit_nint)ptr 
<= jit_max_int))
+                                               {
+                                                       /* We can use absolute 
addressing */
+                                                       if(sizeof(jit_nfloat) 
== sizeof(jit_float64))
+                                                       {
+                                                               
x86_64_fld_mem_size(inst, (jit_nint)ptr, 8);
+                                                       }
+                                                       else
+                                                       {
+                                                               
x86_64_fld_mem_size(inst, (jit_nint)ptr, 10);
+                                                       }
+                                               }
+                                               else
+                                               {
+                                                       /* We have to use an 
extra general register */
+                                                       /* TODO */
+                                               }
+                                       }
+                               }
+                       }
+                       break;
+               }
+       }
+       else if(value->in_register || value->in_global_register)
+       {
+               if(value->in_register)
+               {
+                       src_reg = value->reg;
+                       other_src_reg = -1;
+               }
+               else
+               {
+                       src_reg = value->global_reg;
+                       other_src_reg = -1;
+               }
+
+               switch(type->kind)
+               {
+#if 0
+                       case JIT_TYPE_SBYTE:
+                       {
+                               x86_widen_reg(inst, _jit_reg_info[reg].cpu_reg,
+                                             _jit_reg_info[src_reg].cpu_reg, 
1, 0);
+                       }
+                       break;
+
+                       case JIT_TYPE_UBYTE:
+                       {
+                               x86_widen_reg(inst, _jit_reg_info[reg].cpu_reg,
+                                             _jit_reg_info[src_reg].cpu_reg, 
0, 0);
+                       }
+                       break;
+
+                       case JIT_TYPE_SHORT:
+                       {
+                               x86_widen_reg(inst, _jit_reg_info[reg].cpu_reg,
+                                             _jit_reg_info[src_reg].cpu_reg, 
1, 1);
+                       }
+                       break;
+
+                       case JIT_TYPE_USHORT:
+                       {
+                               x86_widen_reg(inst, _jit_reg_info[reg].cpu_reg,
+                                             _jit_reg_info[src_reg].cpu_reg, 
0, 1);
+                       }
+                       break;
+#else
+                       case JIT_TYPE_SBYTE:
+                       case JIT_TYPE_UBYTE:
+                       case JIT_TYPE_SHORT:
+                       case JIT_TYPE_USHORT:
+#endif
+                       case JIT_TYPE_INT:
+                       case JIT_TYPE_UINT:
+                       {
+                               x86_64_mov_reg_reg_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
_jit_reg_info[src_reg].cpu_reg, 4);
+                       }
+                       break;
+
+                       case JIT_TYPE_LONG:
+                       case JIT_TYPE_ULONG:
+                       {
+                               x86_64_mov_reg_reg_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
_jit_reg_info[src_reg].cpu_reg, 8);
+                       }
+                       break;
+
+                       case JIT_TYPE_FLOAT32:
+                       {
+                               if(IS_FPU_REG(reg))
+                               {
+                                       if(IS_FPU_REG(src_reg))
+                                       {
+                                               x86_fld_reg(inst, 
fp_stack_index(gen, src_reg));
+                                       }
+                                       else if(IS_XMM_REG(src_reg))
+                                       {
+                                               /* Fix the position of the 
value in the stack frame */
+                                               _jit_gen_fix_value(value);
+                                               offset = 
(int)(value->frame_offset);
+
+                                               x86_64_movss_membase_reg(inst, 
X86_64_RBP, offset,
+                                                                               
                 _jit_reg_info[src_reg].cpu_reg);
+                                               x86_64_fld_membase_size(inst, 
X86_64_RBP, offset, 4);
+                                       }
+                               }
+                               else if(IS_XMM_REG(reg))
+                               {
+                                       if(IS_FPU_REG(src_reg))
+                                       {
+                                               /* Fix the position of the 
value in the stack frame */
+                                               _jit_gen_fix_value(value);
+                                               offset = 
(int)(value->frame_offset);
+
+                                               x86_64_fst_membase_size(inst, 
X86_64_RBP, offset, 4);
+                                               x86_64_movss_reg_membase(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
                 X86_64_RBP, offset);
+                                       }
+                                       else if(IS_XMM_REG(src_reg))
+                                       {
+                                               x86_64_movss_reg_reg(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
         _jit_reg_info[src_reg].cpu_reg);
+                                       }
+                               }
+                       }
+                       break;
+
+                       case JIT_TYPE_FLOAT64:
+                       {
+                               if(IS_FPU_REG(reg))
+                               {
+                                       if(IS_FPU_REG(src_reg))
+                                       {
+                                               x86_fld_reg(inst, 
fp_stack_index(gen, src_reg));
+                                       }
+                                       else if(IS_XMM_REG(src_reg))
+                                       {
+                                               /* Fix the position of the 
value in the stack frame */
+                                               _jit_gen_fix_value(value);
+                                               offset = 
(int)(value->frame_offset);
+
+                                               x86_64_movsd_membase_reg(inst, 
X86_64_RBP, offset,
+                                                                               
                 _jit_reg_info[src_reg].cpu_reg);
+                                               x86_64_fld_membase_size(inst, 
X86_64_RBP, offset, 8);
+                                       }
+                               }
+                               else if(IS_XMM_REG(reg))
+                               {
+                                       if(IS_FPU_REG(src_reg))
+                                       {
+                                               /* Fix the position of the 
value in the stack frame */
+                                               _jit_gen_fix_value(value);
+                                               offset = 
(int)(value->frame_offset);
+
+                                               x86_64_fst_membase_size(inst, 
X86_64_RBP, offset, 8);
+                                               x86_64_movsd_reg_membase(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
                 X86_64_RBP, offset);
+                                       }
+                                       else if(IS_XMM_REG(src_reg))
+                                       {
+                                               x86_64_movsd_reg_reg(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
         _jit_reg_info[src_reg].cpu_reg);
+                                       }
+                               }
+                       }
+                       break;
+
+                       case JIT_TYPE_NFLOAT:
+                       {
+                               if(IS_FPU_REG(reg))
+                               {
+                                       if(IS_FPU_REG(src_reg))
+                                       {
+                                               x86_fld_reg(inst, 
fp_stack_index(gen, src_reg));
+                                       }
+                                       else
+                                       {
+                                               fputs("Unsupported native float 
reg - reg move\n", stderr);
+                                       }
+                               }
+                       }
+                       break;
+               }
+       }
+       else
+       {
+               /* Fix the position of the value in the stack frame */
+               _jit_gen_fix_value(value);
+               offset = (int)(value->frame_offset);
+
+               /* Load the value into the specified register */
+               switch(type->kind)
+               {
+                       case JIT_TYPE_SBYTE:
+                       {
+                               x86_64_movsx8_reg_membase_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
           X86_64_RBP, offset, 4);
+                       }
+                       break;
+
+                       case JIT_TYPE_UBYTE:
+                       {
+                               x86_64_movzx8_reg_membase_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
           X86_64_RBP, offset, 4);
+                       }
+                       break;
+
+                       case JIT_TYPE_SHORT:
+                       {
+                               x86_64_movsx16_reg_membase_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
                X86_64_RBP, offset, 4);
+                       }
+                       break;
+
+                       case JIT_TYPE_USHORT:
+                       {
+                               x86_64_movzx16_reg_membase_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
                X86_64_RBP, offset, 4);
+                       }
+                       break;
+
+                       case JIT_TYPE_INT:
+                       case JIT_TYPE_UINT:
+                       {
+                               x86_64_mov_reg_membase_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
        X86_64_RBP, offset, 4);
+                       }
+                       break;
+
+                       case JIT_TYPE_LONG:
+                       case JIT_TYPE_ULONG:
+                       {
+                               x86_64_mov_reg_membase_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
        X86_64_RBP, offset, 8);
+                       }
+                       break;
+
+                       case JIT_TYPE_FLOAT32:
+                       {
+                               if(IS_GENERAL_REG(reg))
+                               {
+                                       x86_64_mov_reg_membase_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
                X86_64_RBP, offset, 4);
+                               }
+                               if(IS_XMM_REG(reg))
+                               {
+                                       x86_64_movss_reg_membase(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
         X86_64_RBP, offset);
+                               }
+                               else
+                               {
+                                       x86_64_fld_membase_size(inst, 
X86_64_RBP, offset, 4);
+                               }
+                       }
+                       break;
+
+                       case JIT_TYPE_FLOAT64:
+                       {
+                               if(IS_GENERAL_REG(reg))
+                               {
+                                       x86_64_mov_reg_membase_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
                X86_64_RBP, offset, 8);
+                               }
+                               else if(IS_XMM_REG(reg))
+                               {
+                                       x86_64_movsd_reg_membase(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
         X86_64_RBP, offset);
+                               }
+                               else
+                               {
+                                       x86_64_fld_membase_size(inst, 
X86_64_RBP, offset, 8);
+                               }
+                       }
+                       break;
+
+                       case JIT_TYPE_NFLOAT:
+                       {
+                               if(sizeof(jit_nfloat) == sizeof(jit_float64))
+                               {
+                                       if(IS_GENERAL_REG(reg))
+                                       {
+                                               
x86_64_mov_reg_membase_size(inst, _jit_reg_info[reg].cpu_reg,
+                                                                               
                        X86_64_RBP, offset, 8);
+                                       }
+                                       else if(IS_XMM_REG(reg))
+                                       {
+                                               x86_64_movsd_reg_membase(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
                 X86_64_RBP, offset);
+                                       }
+                                       else
+                                       {
+                                               x86_64_fld_membase_size(inst, 
X86_64_RBP, offset, 8);
+                                       }
+                               }
+                               else
+                               {
+                                       x86_64_fld_membase_size(inst, 
X86_64_RBP, offset, 10);
+                               }
+                       }
+                       break;
+
+                       case JIT_TYPE_STRUCT:
+                       case JIT_TYPE_UNION:
+                       {
+                               jit_nuint size = jit_type_get_size(type);
+
+                               if(IS_GENERAL_REG(reg))
+                               {
+                                       if(size == 1)
+                                       {
+                                               
x86_64_mov_reg_membase_size(inst, _jit_reg_info[reg].cpu_reg,
+                                                                               
                        X86_64_RBP, offset, 1);
+                                       }
+                                       else if(size == 2)
+                                       {
+                                               
x86_64_mov_reg_membase_size(inst, _jit_reg_info[reg].cpu_reg,
+                                                                               
                        X86_64_RBP, offset, 2);
+                                       }
+                                       else if(size <= 4)
+                                       {
+                                               
x86_64_mov_reg_membase_size(inst, _jit_reg_info[reg].cpu_reg,
+                                                                               
                        X86_64_RBP, offset, 4);
+                                       }
+                                       else if(size <= 8)
+                                       {
+                                               
x86_64_mov_reg_membase_size(inst, _jit_reg_info[reg].cpu_reg,
+                                                                               
                        X86_64_RBP, offset, 8);
+                                       }
+                               }
+                               else if(IS_XMM_REG(reg))
+                               {
+                                       if(size <= 4)
+                                       {
+                                               x86_64_movss_reg_membase(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
                 X86_64_RBP, offset);
+                                       }
+                                       else if(size <= 8)
+                                       {
+                                               x86_64_movsd_reg_membase(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
                 X86_64_RBP, offset);
+                                       }
+                                       else
+                                       {
+                                               int alignment = 
jit_type_get_alignment(type);
+
+                                               if((alignment & 0xf) == 0)
+                                               {
+                                                       
x86_64_movaps_reg_membase(inst, _jit_reg_info[reg].cpu_reg,
+                                                                               
                          X86_64_RBP, offset);
+                                               }
+                                               else
+                                               {
+                                                       
x86_64_movups_reg_membase(inst, _jit_reg_info[reg].cpu_reg,
+                                                                               
                          X86_64_RBP, offset);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       /* End the code output process */
+       jit_cache_end_output();
+}
+
+void
+_jit_gen_get_elf_info(jit_elf_info_t *info)
+{
+       info->machine = 62;     /* EM_X86_64 */
+       info->abi = 0;          /* ELFOSABI_SYSV */
+       info->abi_version = 0;
+}
+
+void *
+_jit_gen_prolog(jit_gencode_t gen, jit_function_t func, void *buf)
+{
+       unsigned char prolog[JIT_PROLOG_SIZE];
+       unsigned char *inst = prolog;
+       int reg;
+       int frame_size = 0;
+       int regs_to_save = 0;
+
+       /* Push ebp onto the stack */
+       x86_64_push_reg_size(inst, X86_64_RBP, 8);
+
+       /* Initialize EBP for the current frame */
+       x86_64_mov_reg_reg_size(inst, X86_64_RBP, X86_64_RSP, 8);
+
+       /* Allocate space for the local variable frame */
+       if(func->builder->frame_size > 0)
+       {
+               /* Make sure that the framesize is a multiple of 8 bytes */
+               frame_size = (func->builder->frame_size + 0x7) & ~0x7;
+       }
+
+       /* Get the number of registers we need to preserve */
+       for(reg = 0; reg < 14; ++reg)
+       {
+               if(jit_reg_is_used(gen->touched, reg) &&
+                  (_jit_reg_info[reg].flags & JIT_REG_CALL_USED) == 0)
+               {
+                       ++regs_to_save;
+               }
+       }
+
+       /* add the register save area to the initial frame size */
+       frame_size += (regs_to_save << 3);
+
+       /* Make sure that the framesize is a multiple of 16 bytes */
+       /* so that the final RSP will be alligned on a 16byte boundary. */
+       frame_size = (frame_size + 0xf) & ~0xf;
+
+       if(frame_size > 0)
+       {
+               x86_64_sub_reg_imm_size(inst, X86_64_RSP, frame_size, 8);
+       }
+
+       if(regs_to_save > 0)
+       {
+               int current_offset = 0;
+
+               /* Save registers that we need to preserve */
+               for(reg = 0; reg <= 14; ++reg)
+               {
+                       if(jit_reg_is_used(gen->touched, reg) &&
+                          (_jit_reg_info[reg].flags & JIT_REG_CALL_USED) == 0)
+                       {
+                               x86_64_mov_membase_reg_size(inst, X86_64_RSP, 
current_offset,
+                                                                               
        _jit_reg_info[reg].cpu_reg, 8);
+                               current_offset += 8;
+                       }
+               }
+       }
+
+       /* Copy the prolog into place and return the adjusted entry position */
+       reg = (int)(inst - prolog);
+       jit_memcpy(((unsigned char *)buf) + JIT_PROLOG_SIZE - reg, prolog, reg);
+       return (void *)(((unsigned char *)buf) + JIT_PROLOG_SIZE - reg);
+}
+
+void
+_jit_gen_epilog(jit_gencode_t gen, jit_function_t func)
+{
+       unsigned char *inst;
+       int reg;
+       int current_offset;
+       jit_int *fixup;
+       jit_int *next;
+
+       /* Bail out if there is insufficient space for the epilog */
+       if(!jit_cache_check_for_n(&(gen->posn), 48))
+       {
+               jit_cache_mark_full(&(gen->posn));
+               return;
+       }
+
+       inst = gen->posn.ptr;
+
+       /* Perform fixups on any blocks that jump to the epilog */
+       fixup = (jit_int *)(gen->epilog_fixup);
+       while(fixup != 0)
+       {
+               if(DEBUG_FIXUPS)
+               {
+                       fprintf(stderr, "Fixup Address: %lx, Value: %x\n",
+                                       (jit_nint)fixup, fixup[0]);
+               }
+               next = (jit_int *)_JIT_CALC_NEXT_FIXUP(fixup, fixup[0]);
+               fixup[0] = (jit_int)(((jit_nint)inst) - ((jit_nint)fixup) - 4);
+               fixup = next;
+       }
+       gen->epilog_fixup = 0;
+
+       /* Restore the used callee saved registers */
+       if(gen->stack_changed)
+       {
+               int frame_size = func->builder->frame_size;
+               int regs_saved = 0;
+
+               /* Get the number of registers we preserves */
+               for(reg = 0; reg < 14; ++reg)
+               {
+                       if(jit_reg_is_used(gen->touched, reg) &&
+                          (_jit_reg_info[reg].flags & JIT_REG_CALL_USED) == 0)
+                       {
+                               ++regs_saved;
+                       }
+               }
+
+               /* add the register save area to the initial frame size */
+               frame_size += (regs_saved << 3);
+
+               /* Make sure that the framesize is a multiple of 16 bytes */
+               /* so that the final RSP will be alligned on a 16byte boundary. 
*/
+               frame_size = (frame_size + 0xf) & ~0xf;
+
+               current_offset = -frame_size;
+
+               for(reg = 0; reg <= 14; ++reg)
+               {
+                       if(jit_reg_is_used(gen->touched, reg) &&
+                          (_jit_reg_info[reg].flags & JIT_REG_CALL_USED) == 0)
+                       {
+                               x86_64_mov_reg_membase_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
        X86_64_RBP, current_offset, 8);
+                               current_offset += 8;
+                       }
+               }
+       }
+       else
+       {
+               current_offset = 0;
+               for(reg = 0; reg <= 14; ++reg)
+               {
+                       if(jit_reg_is_used(gen->touched, reg) &&
+                          (_jit_reg_info[reg].flags & JIT_REG_CALL_USED) == 0)
+                       {
+                               x86_64_mov_reg_membase_size(inst, 
_jit_reg_info[reg].cpu_reg,
+                                                                               
        X86_64_RSP, current_offset, 8);
+                               current_offset += 8;
+                       }
+               }
+       }
+
+       /* Restore stackpointer and frame register */
+       x86_64_mov_reg_reg_size(inst, X86_64_RSP, X86_64_RBP, 8);
+       x86_64_pop_reg_size(inst, X86_64_RBP, 8);
+
+       /* and return */
+       x86_64_ret(inst);
+
+       gen->posn.ptr = inst;
+}
+
/*
 * Copy a small block. This code will be inlined.
 * Set is_aligned to 0 if you don't know if the source and target locations
 * are aligned on a 16byte boundary and != 0 if you know that both blocks are
 * aligned.
 * We assume that offset + size is in the range -2GB ... +2GB.
 *
 * dreg/doffset and sreg/soffset are the base registers and displacements of
 * the destination and source. scratch_reg (general purpose) and scratch_xreg
 * (xmm) are clobbered. Returns the updated instruction pointer.
 * NOTE(review): the gen parameter is unused in this function.
 */
static unsigned char *
small_block_copy(jit_gencode_t gen, unsigned char *inst,
				 int dreg, jit_nint doffset,
				 int sreg, jit_nint soffset, jit_int size,
				 int scratch_reg, int scratch_xreg, int is_aligned)
{
	/* Running displacement from the start of both blocks */
	int offset = 0;

	/* Copy 16 bytes at a time through the scratch xmm register */
	while(size >= 16)
	{
		if(is_aligned)
		{
			/* movaps requires 16-byte aligned memory operands */
			x86_64_movaps_reg_membase(inst, scratch_xreg,
									  sreg, soffset + offset);
			x86_64_movaps_membase_reg(inst, dreg, doffset + offset,
									  scratch_xreg);
		}
		else
		{
			/* movups tolerates unaligned memory operands */
			x86_64_movups_reg_membase(inst, scratch_xreg,
									  sreg, soffset + offset);
			x86_64_movups_membase_reg(inst, dreg, doffset + offset,
									  scratch_xreg);
		}
		size -= 16;
		offset += 16;
	}
	/* Now copy the rest of the struct: the remaining 0-15 bytes are moved
	   with successively smaller loads/stores (8, 4, 2, then 1 byte)
	   through the general purpose scratch register */
	if(size >= 8)
	{
		x86_64_mov_reg_membase_size(inst, scratch_reg,
									sreg, soffset + offset, 8);
		x86_64_mov_membase_reg_size(inst, dreg, doffset + offset,
									scratch_reg, 8);
		size -= 8;
		offset += 8;
	}
	if(size >= 4)
	{
		x86_64_mov_reg_membase_size(inst, scratch_reg,
									sreg, soffset + offset, 4);
		x86_64_mov_membase_reg_size(inst, dreg, doffset + offset,
									scratch_reg, 4);
		size -= 4;
		offset += 4;
	}
	if(size >= 2)
	{
		x86_64_mov_reg_membase_size(inst, scratch_reg,
									sreg, soffset + offset, 2);
		x86_64_mov_membase_reg_size(inst, dreg, doffset + offset,
									scratch_reg, 2);
		size -= 2;
		offset += 2;
	}
	if(size >= 1)
	{
		x86_64_mov_reg_membase_size(inst, scratch_reg,
									sreg, soffset + offset, 1);
		x86_64_mov_membase_reg_size(inst, dreg, doffset + offset,
									scratch_reg, 1);
		size -= 1;
		offset += 1;
	}
	return inst;
}
+
+/*
+ * Copy a struct.
+ * The size of the type must be <= 4 * 16bytes
+ */
+static unsigned char *
+small_struct_copy(jit_gencode_t gen, unsigned char *inst,
+                                 int dreg, jit_nint doffset,
+                                 int sreg, jit_nint soffset, jit_type_t type,
+                                 int scratch_reg, int scratch_xreg)
+{
+       int size = jit_type_get_size(type);
+       int alignment = jit_type_get_alignment(type);
+
+       return small_block_copy(gen, inst, dreg, doffset,
+                                                       sreg, soffset, size, 
scratch_reg,
+                                                       scratch_xreg, 
((alignment & 0xf) == 0));
+}
+
/*
 * Copy a block of memory that has a specific size. All call clobbered
 * registers must be unused at this point.
 *
 * Loads the argument registers for the jit_memcpy call
 * (RDI = destination, RSI = source, RDX = size) and emits the call.
 * RCX is clobbered as a temporary when source and destination have to be
 * swapped (both are call clobbered, which the contract above allows).
 */
static unsigned char *
memory_copy(jit_gencode_t gen, unsigned char *inst,
			int dreg, jit_nint doffset,
			int sreg, jit_nint soffset, jit_nint size)
{
	if(dreg == X86_64_RDI)
	{
		/* Destination already in place; fetch the source if needed */
		if(sreg != X86_64_RSI)
		{
			x86_64_mov_reg_reg_size(inst, X86_64_RSI, sreg, 8);
		}
	}
	else if(dreg == X86_64_RSI)
	{
		if(sreg == X86_64_RDI)
		{
			/* The registers are swapped so we need a temporary register */
			x86_64_mov_reg_reg_size(inst, X86_64_RCX, X86_64_RSI, 8);
			x86_64_mov_reg_reg_size(inst, X86_64_RSI, X86_64_RDI, 8);
			x86_64_mov_reg_reg_size(inst, X86_64_RDI, X86_64_RCX, 8);
		}
		else
		{
			/* Move the destination out of RSI before loading the source */
			x86_64_mov_reg_reg_size(inst, X86_64_RDI, X86_64_RSI, 8);
			if(sreg != X86_64_RSI)
			{
				x86_64_mov_reg_reg_size(inst, X86_64_RSI, sreg, 8);
			}
		}
	}
	else
	{
		/* dreg is neither RDI nor RSI here, so loading RSI first cannot
		   clobber the destination */
		x86_64_mov_reg_reg_size(inst, X86_64_RSI, sreg, 8);
		x86_64_mov_reg_reg_size(inst, X86_64_RDI, dreg, 8);
	}
	/* Move the size to argument register 3 now */
	if((size > 0) && (size <= jit_max_uint))
	{
		/* A 4 byte immediate suffices for sizes that fit in an unsigned
		   int (NOTE(review): assumes the 32-bit mov zero-extends into the
		   full 64-bit register — confirm the macro's encoding) */
		x86_64_mov_reg_imm_size(inst, X86_64_RDX, size, 4);
	}
	else
	{
		x86_64_mov_reg_imm_size(inst, X86_64_RDX, size, 8);
	}
	/* Apply the displacements once the base pointers are in place */
	if(soffset != 0)
	{
		x86_64_add_reg_imm_size(inst, X86_64_RSI, soffset, 8);
	}
	if(doffset != 0)
	{
		x86_64_add_reg_imm_size(inst, X86_64_RDI, doffset, 8);
	}
	inst = x86_64_call_code(inst, (jit_nint)jit_memcpy);
	return inst;
}
+
/*
 * Called when code generation reaches a new basic block: record the
 * block's native address and resolve every pending branch fixup that
 * targets this block.
 */
void
_jit_gen_start_block(jit_gencode_t gen, jit_block_t block)
{
	jit_int *fixup;
	jit_int *next;
	void **absolute_fixup;
	void **absolute_next;

	/* Set the address of this block */
	block->address = (void *)(gen->posn.ptr);

	/* If this block has pending fixups, then apply them now */
	fixup = (jit_int *)(block->fixup_list);
	if(DEBUG_FIXUPS && fixup)
	{
		fprintf(stderr, "Block: %lx\n", (jit_nint)block);
		fprintf(stderr, "Limit: %lx\n", (jit_nint)_JIT_GET_FIXVALUE(gen));
	}
	while(fixup != 0)
	{
		if(DEBUG_FIXUPS)
		{
			fprintf(stderr, "Fixup Address: %lx, Value: %x\n",
					(jit_nint)fixup, fixup[0]);
		}
		/* The fixup slot currently holds the link to the next fixup in
		   the chain; compute that entry before overwriting the slot */
		next = (jit_int *)_JIT_CALC_NEXT_FIXUP(fixup, fixup[0]);
		/* Patch in the 32-bit PC-relative displacement to the block;
		   the -4 accounts for the displacement being relative to the
		   end of the 4-byte field */
		fixup[0] = (jit_int)
			(((jit_nint)(block->address)) - ((jit_nint)fixup) - 4);
		fixup = next;
	}
	block->fixup_list = 0;

	/* Absolute fixups contain complete pointers */
	absolute_fixup = (void**)(block->fixup_absolute_list);
	while(absolute_fixup != 0)
	{
		/* Each slot links to the next fixup; replace the link with the
		   absolute block address */
		absolute_next = (void **)(absolute_fixup[0]);
		absolute_fixup[0] = (void *)((jit_nint)(block->address));
		absolute_fixup = absolute_next;
	}
	block->fixup_absolute_list = 0;
}
+
/*
 * Called when code generation for a basic block is finished.
 */
void
_jit_gen_end_block(jit_gencode_t gen, jit_block_t block)
{
	/* Nothing to do here for x86-64 */
}
+
+int
+_jit_gen_is_global_candidate(jit_type_t type)
+{
+       switch(jit_type_remove_tags(type)->kind)
+       {
+               case JIT_TYPE_INT:
+               case JIT_TYPE_UINT:
+               case JIT_TYPE_LONG:
+               case JIT_TYPE_ULONG:
+               case JIT_TYPE_NINT:
+               case JIT_TYPE_NUINT:
+               case JIT_TYPE_PTR:
+               case JIT_TYPE_SIGNATURE:
+               {
+                       return 1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Do the stuff usually handled in jit-rules.c for native implementations
+ * here too because the common implementation is not enough for x86_64.
+ */
+
/*
 * Special arg_class value flagging that a parameter is passed on the
 * stack instead of in registers.
 */
#define JIT_ARG_CLASS_STACK    0xFFFF

/*
 * Define the way the parameter is passed to a specific function.
 */
typedef struct
{
	jit_value_t value;			/* Value for this parameter (not set in */
								/* this file's visible code — confirm */
								/* against the callers) */
	jit_ushort arg_class;		/* Number of registers used, or */
								/* JIT_ARG_CLASS_STACK for stack passing */
	jit_ushort stack_pad;		/* Number of stack words needed for padding */
	union
	{
		unsigned char reg[4];	/* Registers holding the value when */
								/* arg_class is a register count */
		jit_int offset;			/* Offset in the stack arg region when */
								/* arg_class == JIT_ARG_CLASS_STACK */
	} un;
} _jit_param_t;

/*
 * Structure that is used to help with parameter passing.
 */
typedef struct
{
	int				stack_size;			/* Number of bytes needed on the */
										/* stack for parameter passing */
	int				stack_pad;			/* Number of stack words we have */
										/* to push before pushing the */
										/* parameters for keeping the stack */
										/* aligned */
	unsigned int	word_index;			/* Number of word registers */
										/* allocated */
	unsigned int	max_word_regs;		/* Number of word registers */
										/* available for parameter passing */
	const int	   *word_regs;			/* General purpose register set */
										/* used for allocation */
	unsigned int	float_index;		/* Number of sse registers allocated */
	unsigned int	max_float_regs;		/* Number of sse registers available */
	const int	   *float_regs;			/* SSE register set used for */
										/* allocation */
	_jit_param_t   *params;				/* Per-parameter passing info */

} jit_param_passing_t;
+
+/*
+ * Allcate the slot for a parameter passed on the stack.
+ */
+static void
+_jit_alloc_param_slot(jit_param_passing_t *passing, _jit_param_t *param,
+                                         jit_type_t type)
+{
+       jit_int size = jit_type_get_size(type);
+       jit_int alignment = jit_type_get_alignment(type);
+
+       /* Expand the size to a multiple of the stack slot size */
+       size = ROUND_STACK(size);
+
+       /* Expand the alignment to a multiple of the stack slot size */
+       /* We expect the alignment to be a power of two after this step */
+       alignment = ROUND_STACK(alignment);
+
+       /* Make sure the current offset is aligned propperly for the type */
+       if((passing->stack_size & (alignment -1)) != 0)
+       {
+               /* We need padding on the stack to fix the alignment constraint 
*/
+               jit_int padding = passing->stack_size & (alignment -1);
+
+               /* Add the padding to the stack region */
+               passing->stack_size += padding;
+
+               /* record the number of pad words needed after pushing this arg 
*/
+               param->stack_pad = STACK_SLOTS_USED(padding);
+       }
+       /* Record the offset of the parameter in the arg region. */
+       param->un.offset = passing->stack_size;
+
+       /* And increase the argument region used. */
+       passing->stack_size += size;
+}
+
+/*
+ * Determine if a type corresponds to a structure or union.
+ */
+static int
+is_struct_or_union(jit_type_t type)
+{
+       type = jit_type_normalize(type);
+       if(type)
+       {
+               if(type->kind == JIT_TYPE_STRUCT || type->kind == 
JIT_TYPE_UNION)
+               {
+                       return 1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Classify the argument type.
+ * The type has to be in it's normalized form.
+ */
+static int
+_jit_classify_arg(jit_type_t arg_type, int is_return)
+{
+       switch(arg_type->kind)
+       {
+               case JIT_TYPE_SBYTE:
+               case JIT_TYPE_UBYTE:
+               case JIT_TYPE_SHORT:
+               case JIT_TYPE_USHORT:
+               case JIT_TYPE_INT:
+               case JIT_TYPE_UINT:
+               case JIT_TYPE_NINT:
+               case JIT_TYPE_NUINT:
+               case JIT_TYPE_LONG:
+               case JIT_TYPE_ULONG:
+               case JIT_TYPE_SIGNATURE:
+               case JIT_TYPE_PTR:
+               {
+                       return X86_64_ARG_INTEGER;
+               }
+               break;
+
+               case JIT_TYPE_FLOAT32:
+               case JIT_TYPE_FLOAT64:
+               {
+                       return X86_64_ARG_SSE;
+               }
+               break;
+
+               case JIT_TYPE_NFLOAT:
+               {
+                       /* we assume the nfloat type to be long double (80bit) 
*/
+                       if(is_return)
+                       {
+                               return X86_64_ARG_X87;
+                       }
+                       else
+                       {
+                               return X86_64_ARG_MEMORY;
+                       }
+               }
+               break;
+
+               case JIT_TYPE_STRUCT:
+               case JIT_TYPE_UNION:
+               {
+                       int size = jit_type_get_size(arg_type);
+
+                       if(size > 16)
+                       {
+                               return X86_64_ARG_MEMORY;
+                       }
+                       else if(size <= 8)
+                       {
+                               return X86_64_ARG_INTEGER;
+                       }
+                       /* For structs and unions with sizes between 8 ant 16 
bytes */
+                       /* we have to look at the elements. */
+                       /* TODO */
+               }
+       }
+       return X86_64_ARG_NO_CLASS;
+}
+
/*
 * On X86_64 the alignment of native types matches their size.
 * This leads to the result that all types except nfloats and aggregates
 * (structs and unions) must start and end in an eightbyte (or the part
 * we are looking at).
 *
 * Classify the byte range [start_offset, end_offset] of struct_type by
 * merging the classes of all fields that overlap that range.  `start`
 * is passed down as the offset of struct_type within the outer struct
 * and feeds the misalignment check of leaf fields.
 * Returns one of the X86_64_ARG_* classes, or X86_64_ARG_NO_CLASS when
 * no field overlaps the range.
 */
static int
_jit_classify_structpart(jit_type_t struct_type, unsigned int start,
						 unsigned int start_offset, unsigned int end_offset)
{
	int arg_class = X86_64_ARG_NO_CLASS;
	unsigned int num_fields = jit_type_num_fields(struct_type);
	unsigned int current_field;

	for(current_field = 0; current_field < num_fields; ++current_field)
	{
		jit_nuint field_offset = jit_type_get_offset(struct_type,
													 current_field);

		if(field_offset <= end_offset)
		{
			/* The field starts at a place that's interesting for us */
			jit_type_t field_type = jit_type_get_field(struct_type,
													   current_field);
			jit_nuint field_size = jit_type_get_size(field_type);

			if(field_offset + field_size > start_offset)
			{
				/* The field is at least partially in the part we are */
				/* looking at */
				int arg_class2 = X86_64_ARG_NO_CLASS;

				if(is_struct_or_union(field_type))
				{
					/* We have to check this struct recursively */
					unsigned int current_start;
					unsigned int nested_struct_start;
					unsigned int nested_struct_end;

					/* NOTE(review): current_start adds the absolute
					   `start` to the struct-relative start_offset and is
					   then compared with the struct-relative field_offset
					   — verify this for nested structs with start != 0 */
					current_start = start + start_offset;
					if(field_offset < current_start)
					{
						nested_struct_start = current_start - field_offset;
					}
					else
					{
						nested_struct_start = 0;
					}
					if(field_offset + field_size - 1 > end_offset)
					{
						/* The struct ends beyond the part we are looking at */
						/* NOTE(review): one would expect something like
						   end_offset - field_offset here — confirm this
						   formula against the intended semantics */
						nested_struct_end = field_offset + field_size -
											(nested_struct_start + 1);
					}
					else
					{
						nested_struct_end = field_size - 1;
					}
					arg_class2 = _jit_classify_structpart(field_type,
														  start + field_offset,
														  nested_struct_start,
														  nested_struct_end);
				}
				else
				{
					/* Leaf field: misalignment forces memory class.
					   NOTE(review): the check uses start + start_offset
					   (the start of the examined part) rather than the
					   field's own offset — confirm intent */
					if((start + start_offset) & (field_size - 1))
					{
						/* The field is misaligned */
						return X86_64_ARG_MEMORY;
					}
					arg_class2 = _jit_classify_arg(field_type, 0);
				}
				/* Merge this field's class into the accumulated class:
				   MEMORY dominates everything, INTEGER dominates SSE,
				   and X87 mixed with anything else degrades to MEMORY */
				if(arg_class == X86_64_ARG_NO_CLASS)
				{
					arg_class = arg_class2;
				}
				else if(arg_class != arg_class2)
				{
					if(arg_class == X86_64_ARG_MEMORY ||
					   arg_class2 == X86_64_ARG_MEMORY)
					{
						arg_class = X86_64_ARG_MEMORY;
					}
					else if(arg_class == X86_64_ARG_INTEGER ||
					   arg_class2 == X86_64_ARG_INTEGER)
					{
						arg_class = X86_64_ARG_INTEGER;
					}
					else if(arg_class == X86_64_ARG_X87 ||
					   arg_class2 == X86_64_ARG_X87)
					{
						arg_class = X86_64_ARG_MEMORY;
					}
					else
					{
						arg_class = X86_64_ARG_SSE;
					}
				}
			}
		}
	}
	return arg_class;
}
+
+static int
+_jit_classify_struct(jit_param_passing_t *passing,
+                                       _jit_param_t *param, jit_type_t 
param_type)
+{
+       jit_nuint size = (jit_nuint)jit_type_get_size(param_type);
+
+       if(size <= 8)
+       {
+               int arg_class;
+       
+               arg_class = _jit_classify_structpart(param_type, 0, 0, size - 
1);
+               if(arg_class == X86_64_ARG_NO_CLASS)
+               {
+                       arg_class = X86_64_ARG_SSE;
+               }
+               if(arg_class == X86_64_ARG_INTEGER)
+               {
+                       if(passing->word_index < passing->max_word_regs)
+                       {
+                               /* Set the arg class to the number of registers 
used */
+                               param->arg_class = 1;
+
+                               /* Set the first register to the register used 
*/
+                               param->un.reg[0] = 
passing->word_regs[passing->word_index];
+                               ++(passing->word_index);
+                       }
+                       else
+                       {
+                               /* Set the arg class to stack */
+                               param->arg_class = JIT_ARG_CLASS_STACK;
+
+                               /* Allocate the slot in the arg passing frame */
+                               _jit_alloc_param_slot(passing, param, 
param_type);
+                       }                       
+               }
+               else if(arg_class == X86_64_ARG_SSE)
+               {
+                       if(passing->float_index < passing->max_float_regs)
+                       {
+                               /* Set the arg class to the number of registers 
used */
+                               param->arg_class = 1;
+
+                               /* Set the first register to the register used 
*/
+                               param->un.reg[0] =      
passing->float_regs[passing->float_index];
+                               ++(passing->float_index);
+                       }
+                       else
+                       {
+                               /* Set the arg class to stack */
+                               param->arg_class = JIT_ARG_CLASS_STACK;
+
+                               /* Allocate the slot in the arg passing frame */
+                               _jit_alloc_param_slot(passing, param, 
param_type);
+                       }
+               }
+               else
+               {
+                       /* Set the arg class to stack */
+                       param->arg_class = JIT_ARG_CLASS_STACK;
+
+                       /* Allocate the slot in the arg passing frame */
+                       _jit_alloc_param_slot(passing, param, param_type);
+               }
+       }
+       else if(size <= 16)
+       {
+               int arg_class1;
+               int arg_class2;
+
+               arg_class1 = _jit_classify_structpart(param_type, 0, 0, 7);
+               arg_class2 = _jit_classify_structpart(param_type, 0, 8, size - 
1);
+               if(arg_class1 == X86_64_ARG_NO_CLASS)
+               {
+                       arg_class1 = X86_64_ARG_SSE;
+               }
+               if(arg_class2 == X86_64_ARG_NO_CLASS)
+               {
+                       arg_class2 = X86_64_ARG_SSE;
+               }
+               if(arg_class1 == X86_64_ARG_SSE && arg_class2 == X86_64_ARG_SSE)
+               {
+                       /* We use only one sse register in this case */
+                       if(passing->float_index < passing->max_float_regs)
+                       {
+                               /* Set the arg class to the number of registers 
used */
+                               param->arg_class = 1;
+
+                               /* Set the first register to the register used 
*/
+                               param->un.reg[0] =      
passing->float_regs[passing->float_index];
+                               ++(passing->float_index);
+                       }
+                       else
+                       {
+                               /* Set the arg class to stack */
+                               param->arg_class = JIT_ARG_CLASS_STACK;
+
+                               /* Allocate the slot in the arg passing frame */
+                               _jit_alloc_param_slot(passing, param, 
param_type);
+                       }
+               }
+               else if(arg_class1 == X86_64_ARG_MEMORY ||
+                               arg_class2 == X86_64_ARG_MEMORY)
+               {
+                       /* Set the arg class to stack */
+                       param->arg_class = JIT_ARG_CLASS_STACK;
+
+                       /* Allocate the slot in the arg passing frame */
+                       _jit_alloc_param_slot(passing, param, param_type);
+               }
+               else if(arg_class1 == X86_64_ARG_INTEGER &&
+                               arg_class2 == X86_64_ARG_INTEGER)
+               {
+                       /* We need two general purpose registers in this case */
+                       if((passing->word_index + 1) < passing->max_word_regs)
+                       {
+                               /* Set the arg class to the number of registers 
used */
+                               param->arg_class = 2;
+
+                               /* Assign the registers */
+                               param->un.reg[0] = 
passing->word_regs[passing->word_index];
+                               ++(passing->word_index);
+                               param->un.reg[1] = 
passing->word_regs[passing->word_index];
+                               ++(passing->word_index);
+                       }
+                       else
+                       {
+                               /* Set the arg class to stack */
+                               param->arg_class = JIT_ARG_CLASS_STACK;
+
+                               /* Allocate the slot in the arg passing frame */
+                               _jit_alloc_param_slot(passing, param, 
param_type);
+                       }                       
+               }
+               else
+               {
+                       /* We need one xmm and one general purpose register */
+                       if((passing->word_index < passing->max_word_regs) &&
+                          (passing->float_index < passing->max_float_regs))
+                       {
+                               /* Set the arg class to the number of registers 
used */
+                               param->arg_class = 2;
+
+                               if(arg_class1 == X86_64_ARG_INTEGER)
+                               {
+                                       param->un.reg[0] = 
passing->word_regs[passing->word_index];
+                                       ++(passing->word_index);
+                                       param->un.reg[1] =      
passing->float_regs[passing->float_index];
+                                       ++(passing->float_index);
+                               }
+                               else
+                               {
+                                       param->un.reg[0] =      
passing->float_regs[passing->float_index];
+                                       ++(passing->float_index);
+                                       param->un.reg[1] = 
passing->word_regs[passing->word_index];
+                                       ++(passing->word_index);
+                               }
+                       }
+                       else
+                       {
+                               /* Set the arg class to stack */
+                               param->arg_class = JIT_ARG_CLASS_STACK;
+
+                               /* Allocate the slot in the arg passing frame */
+                               _jit_alloc_param_slot(passing, param, 
param_type);
+                       }
+               }
+       }
+       else
+       {
+               /* Set the arg class to stack */
+               param->arg_class = JIT_ARG_CLASS_STACK;
+
+               /* Allocate the slot in the arg passing frame */
+               _jit_alloc_param_slot(passing, param, param_type);
+       }
+       return 1;
+}
+
+int
+_jit_classify_param(jit_param_passing_t *passing,
+                                       _jit_param_t *param, jit_type_t 
param_type)
+{
+       if(is_struct_or_union(param_type))
+       {
+               return _jit_classify_struct(passing, param, param_type);
+       }
+       else
+       {
+               int arg_class;
+
+               arg_class = _jit_classify_arg(param_type, 0);
+
+               switch(arg_class)
+               {
+                       case X86_64_ARG_INTEGER:
+                       {
+                               if(passing->word_index < passing->max_word_regs)
+                               {
+                                       /* Set the arg class to the number of 
registers used */
+                                       param->arg_class = 1;
+
+                                       /* Set the first register to the 
register used */
+                                       param->un.reg[0] = 
passing->word_regs[passing->word_index];
+                                       ++(passing->word_index);
+                               }
+                               else
+                               {
+                                       /* Set the arg class to stack */
+                                       param->arg_class = JIT_ARG_CLASS_STACK;
+
+                                       /* Allocate the slot in the arg passing 
frame */
+                                       _jit_alloc_param_slot(passing, param, 
param_type);
+                               }
+                       }
+                       break;
+
+                       case X86_64_ARG_SSE:
+                       {
+                               if(passing->float_index < 
passing->max_float_regs)
+                               {
+                                       /* Set the arg class to the number of 
registers used */
+                                       param->arg_class = 1;
+
+                                       /* Set the first register to the 
register used */
+                                       param->un.reg[0] =      
passing->float_regs[passing->float_index];
+                                       ++(passing->float_index);
+                               }
+                               else
+                               {
+                                       /* Set the arg class to stack */
+                                       param->arg_class = JIT_ARG_CLASS_STACK;
+
+                                       /* Allocate the slot in the arg passing 
frame */
+                                       _jit_alloc_param_slot(passing, param, 
param_type);
+                               }
+                       }
+                       break;
+
+                       case X86_64_ARG_MEMORY:
+                       {
+                               /* Set the arg class to stack */
+                               param->arg_class = JIT_ARG_CLASS_STACK;
+
+                               /* Allocate the slot in the arg passing frame */
+                               _jit_alloc_param_slot(passing, param, 
param_type);
+                       }
+                       break;
+               }
+       }
+       return 1;
+}
+
+static int
+_jit_classify_struct_return(jit_param_passing_t *passing,
+                                       _jit_param_t *param, jit_type_t 
return_type)
+{
+       /* Initialize the param passing structure */
+       jit_memset(passing, 0, sizeof(jit_param_passing_t));
+       jit_memset(param, 0, sizeof(_jit_param_t));
+
+       passing->word_regs = _jit_word_return_regs;
+       passing->max_word_regs = _jit_num_word_return_regs;
+       passing->float_regs = _jit_sse_return_regs;
+       passing->max_float_regs = _jit_num_sse_return_regs;
+
+       if(!(_jit_classify_struct(passing, param, return_type)))
+       {
+               return 0;
+       }
+
+       return 1;
+}
+
/*
 * Load a struct to the register(s) in which it will be returned.
 *
 * "inst" is the current code-emission pointer, "ptr_reg" holds the
 * address of the struct to be returned.  The emitted instructions load
 * the struct's bytes from [ptr_reg] into the return register(s) chosen
 * by _jit_classify_struct_return.  Returns the advanced emission
 * pointer; on classification failure nothing is emitted and "inst" is
 * returned unchanged.
 */
static unsigned char *
return_struct(unsigned char *inst, jit_function_t func, int ptr_reg)
{
    jit_type_t return_type;
    jit_type_t signature = jit_function_get_signature(func);

    return_type = jit_type_get_return(signature);
    if(is_struct_or_union(return_type))
    {
        jit_nuint size;
        jit_param_passing_t passing;
        _jit_param_t return_param;

        if(!_jit_classify_struct_return(&passing, &return_param,
                                        return_type))
        {
            /* It's an error so simply return insn */
            return inst;
        }

        size = jit_type_get_size(return_type);
        if(size <= 8)
        {
            /* one register is used for returning the value */
            if(IS_GENERAL_REG(return_param.un.reg[0]))
            {
                int reg = _jit_reg_info[return_param.un.reg[0]].cpu_reg;

                /* Load 4 or 8 bytes depending on the struct size */
                if(size <= 4)
                {
                    x86_64_mov_reg_regp_size(inst, reg, ptr_reg, 4);
                }
                else
                {
                    x86_64_mov_reg_regp_size(inst, reg, ptr_reg, 8);
                }
            }
            else
            {
                /* SSE class: load as scalar float or double */
                int reg = _jit_reg_info[return_param.un.reg[0]].cpu_reg;

                if(size <= 4)
                {
                    x86_64_movss_reg_regp(inst, reg, ptr_reg);
                }
                else
                {
                    x86_64_movsd_reg_regp(inst, reg, ptr_reg);
                }
            }
        }
        else
        {
            /* In this case we might need up to two registers */
            if(return_param.arg_class == 1)
            {
                /* This must be one xmm register (> 8 bytes but a single
                   SSE class - e.g. a struct of packed floats) */
                int reg = _jit_reg_info[return_param.un.reg[0]].cpu_reg;
                int alignment = jit_type_get_alignment(return_type);

                /* NOTE(review): this tests the type's declared alignment,
                   not the runtime address of [ptr_reg] - movaps faults on
                   a misaligned address, so this assumes the pointed-to
                   storage honors the type alignment - TODO confirm */
                if((alignment & 0xf) == 0)
                {
                    /* The type is aligned on a 16 byte boundary */
                    x86_64_movaps_reg_regp(inst, reg, ptr_reg);
                }
                else
                {
                    x86_64_movups_reg_regp(inst, reg, ptr_reg);
                }
            }
            else
            {
                /* Two registers: first eightbyte at offset 0 ... */
                int reg = _jit_reg_info[return_param.un.reg[0]].cpu_reg;

                if(IS_GENERAL_REG(return_param.un.reg[0]))
                {
                    x86_64_mov_reg_regp_size(inst, reg, ptr_reg, 8);
                }
                else
                {
                    x86_64_movsd_reg_regp(inst, reg, ptr_reg);
                }
                /* ... then the remaining bytes at offset 8 */
                size -= 8;
                reg = _jit_reg_info[return_param.un.reg[1]].cpu_reg;
                if(IS_GENERAL_REG(return_param.un.reg[1]))
                {
                    if(size <= 4)
                    {
                        x86_64_mov_reg_membase_size(inst, reg, ptr_reg,
                                                    8, 4);
                    }
                    else
                    {
                        x86_64_mov_reg_membase_size(inst, reg, ptr_reg,
                                                    8, 8);
                    }
                }
                else
                {
                    if(size <= 4)
                    {
                        x86_64_movss_reg_membase(inst, reg,
                                                 ptr_reg, 8);
                    }
                    else
                    {
                        x86_64_movsd_reg_membase(inst, reg,
                                                 ptr_reg, 8);
                    }
                }
            }
        }
    }
    return inst;
}
+
/*
 * Flush a struct return value from the registers to the value
 * on the stack.
 *
 * This is the inverse of return_struct: after a call, the struct's
 * bytes live in the return register(s); this emits stores of those
 * registers into the value's stack frame slot at RBP + frame_offset.
 * Returns the advanced emission pointer; on classification failure
 * nothing is emitted and "inst" is returned unchanged.
 */
static unsigned char *
flush_return_struct(unsigned char *inst, jit_value_t value)
{
    jit_type_t return_type;

    return_type = jit_value_get_type(value);
    if(is_struct_or_union(return_type))
    {
        jit_nuint size;
        jit_nint offset;
        jit_param_passing_t passing;
        _jit_param_t return_param;

        if(!_jit_classify_struct_return(&passing, &return_param, return_type))
        {
            /* It's an error so simply return insn */
            return inst;
        }

        return_param.value = value;

        /* Make sure the value has a frame slot before reading its offset */
        _jit_gen_fix_value(value);
        size = jit_type_get_size(return_type);
        offset = value->frame_offset;
        if(size <= 8)
        {
            /* one register is used for returning the value */
            if(IS_GENERAL_REG(return_param.un.reg[0]))
            {
                int reg = _jit_reg_info[return_param.un.reg[0]].cpu_reg;

                /* Store 4 or 8 bytes depending on the struct size */
                if(size <= 4)
                {
                    x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset, reg, 4);
                }
                else
                {
                    x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset, reg, 8);
                }
            }
            else
            {
                /* SSE class: store as scalar float or double */
                int reg = _jit_reg_info[return_param.un.reg[0]].cpu_reg;

                if(size <= 4)
                {
                    x86_64_movss_membase_reg(inst, X86_64_RBP, offset, reg);
                }
                else
                {
                    x86_64_movsd_membase_reg(inst, X86_64_RBP, offset, reg);
                }
            }
        }
        else
        {
            /* In this case we might need up to two registers */
            if(return_param.arg_class == 1)
            {
                /* This must be one xmm register */
                int reg = _jit_reg_info[return_param.un.reg[0]].cpu_reg;
                int alignment = jit_type_get_alignment(return_type);

                /* NOTE(review): checks the type's declared alignment, not
                   the actual RBP+offset address - assumes the frame slot
                   honors the type alignment - TODO confirm */
                if((alignment & 0xf) == 0)
                {
                    /* The type is aligned on a 16 byte boundary */
                    x86_64_movaps_membase_reg(inst, X86_64_RBP, offset, reg);
                }
                else
                {
                    x86_64_movups_membase_reg(inst, X86_64_RBP, offset, reg);
                }
            }
            else
            {
                /* Two registers: first eightbyte to [RBP+offset] ... */
                int reg = _jit_reg_info[return_param.un.reg[0]].cpu_reg;

                if(IS_GENERAL_REG(return_param.un.reg[0]))
                {
                    x86_64_mov_membase_reg_size(inst, X86_64_RBP, offset,
                                                reg, 8);
                }
                else
                {
                    x86_64_movsd_membase_reg(inst, X86_64_RBP, offset, reg);
                }
                /* ... then the remaining bytes to [RBP+offset+8] */
                size -= 8;
                reg = _jit_reg_info[return_param.un.reg[1]].cpu_reg;
                if(IS_GENERAL_REG(return_param.un.reg[1]))
                {
                    if(size <= 4)
                    {
                        x86_64_mov_membase_reg_size(inst, X86_64_RBP,
                                                    offset + 8, reg, 4);
                    }
                    else
                    {
                        x86_64_mov_membase_reg_size(inst, X86_64_RBP,
                                                    offset + 8, reg, 8);
                    }
                }
                else
                {
                    if(size <= 4)
                    {
                        x86_64_movss_membase_reg(inst, X86_64_RBP,
                                                 offset + 8, reg);
                    }
                    else
                    {
                        x86_64_movsd_membase_reg(inst, X86_64_RBP,
                                                 offset + 8, reg);
                    }
                }
            }
        }
    }
    return inst;
}
+
/*
 * Print a "not implemented yet" marker with the current source
 * location.  Wrapped in do/while(0) so it behaves as a single
 * statement in any context.
 */
#define TODO()          \
    do { \
        fprintf(stderr, "TODO at %s, %d\n", __FILE__, (int)__LINE__); \
    } while (0)
+
/*
 * Generate native x86-64 code for one instruction by dispatching on
 * its opcode to the machine-description rules included from
 * "jit-rules-x86-64.inc".  Opcodes without a rule yet fall through to
 * the default case and emit a diagnostic instead of generating code.
 */
void
_jit_gen_insn(jit_gencode_t gen, jit_function_t func,
              jit_block_t block, jit_insn_t insn)
{
    switch(insn->opcode)
    {
    #define JIT_INCLUDE_RULES
    #include "jit-rules-x86-64.inc"
    #undef JIT_INCLUDE_RULES

    default:
        {
            /* No rule for this opcode yet - report it rather than crash */
            fprintf(stderr, "TODO(%x) at %s, %d\n",
                    (int)(insn->opcode), __FILE__, (int)__LINE__);
        }
        break;
    }
}
+
+/*
+ * Fixup the passing area after all parameters have been allocated either
+ * in registers or on the stack.
+ * This is typically used for adding pad words for keeping the stack aligned.
+ */
+void
+_jit_fix_call_stack(jit_param_passing_t *passing)
+{
+       if((passing->stack_size & 0x0f) != 0)
+       {
+               passing->stack_size = (passing->stack_size + 0x0f) & 
~((jit_nint)0x0f);
+               passing->stack_pad = 1;
+       }
+}
+
+/*
+ * Setup the call stack before pushing any parameters.
+ * This is used usually for pushing pad words for alignment.
+ * The function is needed only if the backend doesn't work with the
+ * parameter area.
+ */
+int
+_jit_setup_call_stack(jit_function_t func, jit_param_passing_t *passing)
+{
+       if(passing->stack_pad)
+       {
+               int current;
+               jit_value_t pad_value;
+
+               pad_value = jit_value_create_nint_constant(func, jit_type_nint, 
0);
+               if(!pad_value)
+               {
+                       return 0;
+               }
+               for(current = 0; current < passing->stack_pad; ++current)
+               {
+                       if(!jit_insn_push(func, pad_value))
+                       {
+                               return 0;
+                       }
+               }
+       }
+       return 1;
+}
+
+/*
+ * Push a parameter onto the stack.
+ */
+static int
+push_param(jit_function_t func, _jit_param_t *param, jit_type_t type)
+{
+       if(is_struct_or_union(type) && !is_struct_or_union(param->value->type))
+       {
+               jit_value_t value;
+
+               if(!(value = jit_insn_address_of(func, param->value)))
+               {
+                       return 0;
+               }
+       #ifdef JIT_USE_PARAM_AREA
+               /* Copy the value into the outgoing parameter area, by pointer 
*/
+               if(!jit_insn_set_param_ptr(func, value, type, param->un.offset))
+               {
+                       return 0;
+               }
+       #else
+               /* Push the parameter value onto the stack, by pointer */
+               if(!jit_insn_push_ptr(func, value, type))
+               {
+                       return 0;
+               }
+               if(param->stack_pad)
+               {
+                       int current;
+                       jit_value_t pad_value;
+
+                       pad_value = jit_value_create_nint_constant(func, 
jit_type_nint, 0);
+                       if(!pad_value)
+                       {
+                               return 0;
+                       }
+                       for(current = 0; current < param->stack_pad; ++current)
+                       {
+                               if(!jit_insn_push(func, pad_value))
+                               {
+                                       return 0;
+                               }
+                       }
+               }
+       #endif
+       }
+       else
+       {
+       #ifdef JIT_USE_PARAM_AREA
+               /* Copy the value into the outgoing parameter area */
+               if(!jit_insn_set_param(func, param->value, param->un.offset))
+               {
+                       return 0;
+               }
+       #else
+               /* Push the parameter value onto the stack */
+               if(!jit_insn_push(func, param->value))
+               {
+                       return 0;
+               }
+               if(param->stack_pad)
+               {
+                       int current;
+                       jit_value_t pad_value;
+
+                       pad_value = jit_value_create_nint_constant(func, 
jit_type_nint, 0);
+                       if(!pad_value)
+                       {
+                               return 0;
+                       }
+                       for(current = 0; current < param->stack_pad; ++current)
+                       {
+                               if(!jit_insn_push(func, pad_value))
+                               {
+                                       return 0;
+                               }
+                       }
+               }
+       #endif
+       }
+       return 1;
+}
+
+int
+_jit_setup_incoming_param(jit_function_t func, _jit_param_t *param,
+                                                 jit_type_t param_type)
+{
+       if(param->arg_class == JIT_ARG_CLASS_STACK)
+       {
+               /* The parameter is passed on the stack */
+               if(!jit_insn_incoming_frame_posn
+                               (func, param->value, param->un.offset))
+               {
+                       return 0;
+               }
+       }
+       else
+       {
+               param_type = jit_type_remove_tags(param_type);
+
+               switch(param_type->kind)
+               {
+                       case JIT_TYPE_STRUCT:
+                       case JIT_TYPE_UNION:
+                       {
+                               if(param->arg_class == 1)
+                               {
+                                       if(!jit_insn_incoming_reg(func, 
param->value, param->un.reg[0]))
+                                       {
+                                               return 0;
+                                       }
+                               }
+                               else
+                               {
+                                       /* These cases have to be handled 
specially */
+                               }
+                       }
+                       break;
+
+                       default:
+                       {
+                               if(!jit_insn_incoming_reg(func, param->value, 
param->un.reg[0]))
+                               {
+                                       return 0;
+                               }
+                       }
+                       break;
+               }
+       }
+       return 1;
+}
+
+int
+_jit_setup_outgoing_param(jit_function_t func, _jit_param_t *param,
+                                                 jit_type_t param_type)
+{
+       if(param->arg_class == JIT_ARG_CLASS_STACK)
+       {
+               /* The parameter is passed on the stack */
+               if(!push_param(func, param, param_type))
+               {
+                       return 0;
+               }
+       }
+       else
+       {
+               param_type = jit_type_remove_tags(param_type);
+
+               switch(param_type->kind)
+               {
+                       case JIT_TYPE_STRUCT:
+                       case JIT_TYPE_UNION:
+                       {
+                               /* These cases have to be handled specially */
+                               if(param->arg_class == 1)
+                               {
+                                       /* Only one xmm register is used for 
passing this argument */
+                                       if(!jit_insn_outgoing_reg(func, 
param->value, param->un.reg[0]))
+                                       {
+                                               return 0;
+                                       }
+                               }
+                               else
+                               {
+                                       /* We need two registers for passing 
the value */
+                                       jit_nuint size = 
(jit_nuint)jit_type_get_size(param_type);
+
+                                       jit_value_t struct_ptr;
+
+                                       if(!(struct_ptr = 
jit_insn_address_of(func, param->value)))
+                                       {
+                                               return 0;
+                                       }
+                                       if(IS_GENERAL_REG(param->un.reg[0]))
+                                       {
+                                               jit_value_t param_value;
+
+                                               param_value = 
jit_insn_load_relative(func, struct_ptr,
+                                                                               
                                         0, jit_type_ulong);
+                                               if(!param_value)
+                                               {
+                                                       return 0;
+                                               }
+                                               if(!jit_insn_outgoing_reg(func, 
param_value, param->un.reg[0]))
+                                               {
+                                                       return 0;
+                                               }
+                                       }
+                                       else
+                                       {
+                                               jit_value_t param_value;
+
+                                               param_value = 
jit_insn_load_relative(func, struct_ptr,
+                                                                               
                                         0, jit_type_float64);
+                                               if(!param_value)
+                                               {
+                                                       return 0;
+                                               }
+                                               if(!jit_insn_outgoing_reg(func, 
param_value, param->un.reg[0]))
+                                               {
+                                                       return 0;
+                                               }
+                                       }
+                                       size -= 8;
+                                       if(IS_GENERAL_REG(param->un.reg[1]))
+                                       {
+                                               if(size == 1)
+                                               {
+                                                       jit_value_t param_value;
+
+                                                       param_value = 
jit_insn_load_relative(func, struct_ptr,
+                                                                               
                                                 8, jit_type_ubyte);
+                                                       if(!param_value)
+                                                       {
+                                                               return 0;
+                                                       }
+                                                       
if(!jit_insn_outgoing_reg(func, param_value, param->un.reg[1]))
+                                                       {
+                                                               return 0;
+                                                       }
+                                               }
+                                               else if(size == 2)
+                                               {
+                                                       jit_value_t param_value;
+
+                                                       param_value = 
jit_insn_load_relative(func, struct_ptr,
+                                                                               
                                                 8, jit_type_ushort);
+                                                       if(!param_value)
+                                                       {
+                                                               return 0;
+                                                       }
+                                                       
if(!jit_insn_outgoing_reg(func, param_value, param->un.reg[0]))
+                                                       {
+                                                               return 0;
+                                                       }
+                                               }
+                                               else if(size <= 4)
+                                               {
+                                                       jit_value_t param_value;
+
+                                                       param_value = 
jit_insn_load_relative(func, struct_ptr,
+                                                                               
                                                 8, jit_type_uint);
+                                                       if(!param_value)
+                                                       {
+                                                               return 0;
+                                                       }
+                                                       
if(!jit_insn_outgoing_reg(func, param_value, param->un.reg[0]))
+                                                       {
+                                                               return 0;
+                                                       }
+                                               }
+                                               else
+                                               {
+                                                       jit_value_t param_value;
+
+                                                       param_value = 
jit_insn_load_relative(func, struct_ptr,
+                                                                               
                                                 8, jit_type_ulong);
+                                                       if(!param_value)
+                                                       {
+                                                               return 0;
+                                                       }
+                                                       
if(!jit_insn_outgoing_reg(func, param_value, param->un.reg[0]))
+                                                       {
+                                                               return 0;
+                                                       }
+                                               }
+                                       }
+                                       else
+                                       {
+                                               if(size <= 4)
+                                               {
+                                                       jit_value_t param_value;
+
+                                                       param_value = 
jit_insn_load_relative(func, struct_ptr,
+                                                                               
                                                 8, jit_type_float32);
+                                                       if(!param_value)
+                                                       {
+                                                               return 0;
+                                                       }
+                                                       
if(!jit_insn_outgoing_reg(func, param_value, param->un.reg[0]))
+                                                       {
+                                                               return 0;
+                                                       }
+                                               }
+                                               else
+                                               {
+                                                       jit_value_t param_value;
+
+                                                       param_value = 
jit_insn_load_relative(func, struct_ptr,
+                                                                               
                                                 8, jit_type_float64);
+                                                       if(!param_value)
+                                                       {
+                                                               return 0;
+                                                       }
+                                                       
if(!jit_insn_outgoing_reg(func, param_value, param->un.reg[0]))
+                                                       {
+                                                               return 0;
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+                       break;
+
+                       default:
+                       {
+                               if(!jit_insn_outgoing_reg(func, param->value, 
param->un.reg[0]))
+                               {
+                                       return 0;
+                               }
+                       }
+                       break;
+               }
+       }
+       return 1;
+}
+
+int
+_jit_setup_return_value(jit_function_t func, jit_value_t return_value,
+                                               jit_type_t return_type)
+
+{
+       /* Structure values must be flushed into the frame, and
+          everything else ends up in a register */
+       if(is_struct_or_union(return_type))
+       {
+               jit_param_passing_t passing;
+               _jit_param_t return_param;
+
+               if(!_jit_classify_struct_return(&passing, &return_param, 
return_type))
+               {
+                       /* It's an error so simply return insn */
+                       return 0;       
+               }
+
+               if(return_param.arg_class == 1)
+               {
+                       if(!jit_insn_return_reg(func, return_value,
+                                                                       
return_param.un.reg[0]))
+                       {
+                               return 0;
+                       }
+               }
+               else
+               {
+                       if(!jit_insn_flush_struct(func, return_value))
+                       {
+                               return 0;
+                       }
+               }
+       }
+       else if(return_type == jit_type_float32 ||
+                       return_type == jit_type_float64)
+       {
+               if(!jit_insn_return_reg(func, return_value, X86_64_REG_XMM0))
+               {
+                       return 0;
+               }
+       }
+       else if(return_type == jit_type_nfloat)
+       {
+               if(!jit_insn_return_reg(func, return_value, X86_64_REG_ST0))
+               {
+                       return 0;
+               }
+       }
+       else if(return_type->kind != JIT_TYPE_VOID)
+       {
+               if(!jit_insn_return_reg(func, return_value, X86_64_REG_RAX))
+               {
+                       return 0;
+               }
+       }
+       return 1;
+}
+
+void
+_jit_init_args(int abi, jit_param_passing_t *passing)
+{
+       passing->max_word_regs = _jit_num_word_regs;
+       passing->word_regs = _jit_word_arg_regs;
+       passing->max_float_regs = _jit_num_float_regs;
+       passing->float_regs = _jit_float_arg_regs;
+}
+
+int
+_jit_create_entry_insns(jit_function_t func)
+{
+       jit_type_t signature = func->signature;
+       int abi = jit_type_get_abi(signature);
+       unsigned int num_args = jit_type_num_params(signature);
+       jit_param_passing_t passing;
+       _jit_param_t param[num_args];
+       _jit_param_t nested_param;
+       _jit_param_t struct_return_param;
+       int current_param;
+
+       /* Reset the local variable frame size for this function */
+       func->builder->frame_size = JIT_INITIAL_FRAME_SIZE;
+
+       /* Initialize the param passing structure */
+       jit_memset(&passing, 0, sizeof(jit_param_passing_t));
+       jit_memset(param, 0, sizeof(_jit_param_t) * num_args);
+
+       passing.params = param;
+       passing.stack_size = JIT_INITIAL_STACK_OFFSET;
+
+       /* Let the specific backend initialize it's part of the params */
+       _jit_init_args(abi, &passing);
+
+       /* If the function is nested, then we need an extra parameter
+          to pass the pointer to the parent's local variable frame */
+       if(func->nested_parent)
+       {
+               jit_memset(&nested_param, 0, sizeof(_jit_param_t));
+               if(!(_jit_classify_param(&passing, &nested_param,
+                                                                
jit_type_void_ptr)))
+               {
+                       return 0;
+               }
+       }
+
+       /* Allocate the structure return pointer */
+       if(jit_value_get_struct_pointer(func))
+       {
+               jit_memset(&struct_return_param, 0, sizeof(_jit_param_t));
+               if(!(_jit_classify_param(&passing, &struct_return_param,
+                                                                
jit_type_void_ptr)))
+               {
+                       return 0;
+               }
+       }
+
+       /* Let the backend classify the parameters */
+       for(current_param = 0; current_param < num_args; current_param++)
+       {
+               jit_type_t param_type;
+
+               param_type = jit_type_get_param(signature, current_param);
+               param_type = jit_type_normalize(param_type);
+               
+               if(!(_jit_classify_param(&passing, 
&(passing.params[current_param]),
+                                                                param_type)))
+               {
+                       return 0;
+               }
+       }
+
+       /* Now we can setup the incoming parameters */
+       for(current_param = 0; current_param < num_args; current_param++)
+       {
+               jit_type_t param_type;
+
+               param_type = jit_type_get_param(signature, current_param);
+               if(!(param[current_param].value))
+               {
+                       if(!(param[current_param].value = 
jit_value_get_param(func, current_param)))
+                       {
+                               return 0;
+                       }
+               }
+               if(!_jit_setup_incoming_param(func, &(param[current_param]), 
param_type))
+               {
+                       return 0;
+               }
+       }
+
+       return 1;
+}
+
+int _jit_create_call_setup_insns
+       (jit_function_t func, jit_type_t signature,
+        jit_value_t *args, unsigned int num_args,
+        int is_nested, int nesting_level, jit_value_t *struct_return, int 
flags)
+{
+       int abi = jit_type_get_abi(signature);
+       jit_type_t return_type;
+       jit_value_t value;
+       jit_value_t return_ptr;
+       int current_param;
+       jit_param_passing_t passing;
+       _jit_param_t param[num_args];
+       _jit_param_t nested_param;
+       _jit_param_t struct_return_param;
+
+       /* Initialize the param passing structure */
+       jit_memset(&passing, 0, sizeof(jit_param_passing_t));
+       jit_memset(param, 0, sizeof(_jit_param_t) * num_args);
+
+       passing.params = param;
+       passing.stack_size = 0;
+
+       /* Let the specific backend initialize it's part of the params */
+       _jit_init_args(abi, &passing);
+
+       /* Determine how many parameters are going to end up in word registers,
+          and compute the largest stack size needed to pass stack parameters */
+       if(is_nested)
+       {
+               jit_memset(&nested_param, 0, sizeof(_jit_param_t));
+               if(!(_jit_classify_param(&passing, &nested_param,
+                                                                
jit_type_void_ptr)))
+               {
+                       return 0;
+               }
+       }
+
+       /* Determine if we need an extra hidden parameter for returning a 
+          structure */
+       return_type = jit_type_get_return(signature);
+       if(jit_type_return_via_pointer(return_type))
+       {
+               value = jit_value_create(func, return_type);
+               if(!value)
+               {
+                       return 0;
+               }
+               *struct_return = value;
+               return_ptr = jit_insn_address_of(func, value);
+               if(!return_ptr)
+               {
+                       return 0;
+               }
+               jit_memset(&struct_return_param, 0, sizeof(_jit_param_t));
+               if(!(_jit_classify_param(&passing, &struct_return_param,
+                                                                
jit_type_void_ptr)))
+               {
+                       return 0;
+               }
+               struct_return_param.value = return_ptr;
+       }
+       else
+       {
+               *struct_return = 0;
+               return_ptr = 0;
+       }
+
+       /* Let the backend classify the parameters */
+       for(current_param = 0; current_param < num_args; current_param++)
+       {
+               jit_type_t param_type;
+
+               param_type = jit_type_get_param(signature, current_param);
+               param_type = jit_type_normalize(param_type);
+               
+               if(!(_jit_classify_param(&passing, 
&(passing.params[current_param]),
+                                                                param_type)))
+               {
+                       return 0;
+               }
+               /* Set the argument value */
+               passing.params[current_param].value = args[current_param];
+       }
+
+#ifdef JIT_USE_PARAM_AREA
+       if(passing.stack_size > func->builder->param_area_size)
+       {
+               func->builder->param_area_size = passing.stack_size;
+       }
+#else
+       /* Let the backend do final adjustments to the passing area */
+       _jit_fix_call_stack(&passing);
+
+       /* Flush deferred stack pops from previous calls if too many
+          parameters have collected up on the stack since last time */
+       if(!jit_insn_flush_defer_pop(func, 32 - passing.stack_size))
+       {
+               return 0;
+       }
+
+       if(!_jit_setup_call_stack(func, &passing))
+       {
+               return 0;
+       }
+#endif
+
+       /* Now setup the arguments on the stack or in the registers in reverse 
order */
+       current_param = num_args;
+       while(current_param > 0)
+       {
+               jit_type_t param_type;
+
+               --current_param;
+               param_type = jit_type_get_param(signature, current_param);
+               if(!_jit_setup_outgoing_param(func, &(param[current_param]), 
param_type))
+               {
+                       return 0;
+               }
+       }
+
+       /* Add the structure return pointer if required */
+       if(return_ptr)
+       {
+               if(!_jit_setup_outgoing_param(func, &struct_return_param, 
return_type))
+               {
+                       return 0;
+               }
+       }
+
+       return 1;
+}
+
+int
+_jit_create_call_return_insns(jit_function_t func, jit_type_t signature,
+                                                         jit_value_t *args, 
unsigned int num_args,
+                                                         jit_value_t 
return_value, int is_nested)
+{
+       int abi = jit_type_get_abi(signature);
+       jit_type_t return_type;
+       int ptr_return;
+       int current_param;
+#ifndef JIT_USE_PARAM_AREA
+       jit_param_passing_t passing;
+       _jit_param_t param[num_args];
+       _jit_param_t nested_param;
+       _jit_param_t struct_return_param;
+#endif /* !JIT_USE_PARAM_AREA */
+
+       return_type = jit_type_normalize(jit_type_get_return(signature));
+       ptr_return = jit_type_return_via_pointer(return_type);
+#ifndef JIT_USE_PARAM_AREA
+       /* Initialize the param passing structure */
+       jit_memset(&passing, 0, sizeof(jit_param_passing_t));
+       jit_memset(param, 0, sizeof(_jit_param_t) * num_args);
+
+       passing.params = param;
+       passing.stack_size = 0;
+
+       /* Let the specific backend initialize it's part of the params */
+       _jit_init_args(abi, &passing);
+
+       /* Determine how many parameters are going to end up in word registers,
+          and compute the largest stack size needed to pass stack parameters */
+       if(is_nested)
+       {
+               jit_memset(&nested_param, 0, sizeof(_jit_param_t));
+               if(!(_jit_classify_param(&passing, &nested_param,
+                                                                
jit_type_void_ptr)))
+               {
+                       return 0;
+               }
+       }
+
+       /* Determine if we need an extra hidden parameter for returning a 
+          structure */
+       if(ptr_return)
+       {
+               jit_memset(&struct_return_param, 0, sizeof(_jit_param_t));
+               if(!(_jit_classify_param(&passing, &struct_return_param,
+                                                                
jit_type_void_ptr)))
+               {
+                       return 0;
+               }
+       }
+
+       /* Let the backend classify the parameters */
+       for(current_param = 0; current_param < num_args; current_param++)
+       {
+               jit_type_t param_type;
+
+               param_type = jit_type_get_param(signature, current_param);
+               param_type = jit_type_normalize(param_type);
+               
+               if(!(_jit_classify_param(&passing, 
&(passing.params[current_param]),
+                                                                param_type)))
+               {
+                       return 0;
+               }
+       }
+
+       /* Let the backend do final adjustments to the passing area */
+       _jit_fix_call_stack(&passing);
+
+       /* Pop the bytes from the system stack */
+       if(passing.stack_size > 0)
+       {
+               if(!jit_insn_defer_pop_stack(func, passing.stack_size))
+               {
+                       return 0;
+               }
+       }
+#endif /* !JIT_USE_PARAM_AREA */
+
+       /* Bail out now if we don't need to worry about return values */
+       if(!return_value || ptr_return)
+       {
+               return 1;
+       }
+
+       if(!_jit_setup_return_value(func, return_value, return_type))
+       {
+               return 0;
+       }
+
+       /* Everything is back where it needs to be */
+       return 1;
+}
+
+#endif /* JIT_BACKEND_X86_64 */

Index: jit/jit-rules-x86-64.h
===================================================================
RCS file: jit/jit-rules-x86-64.h
diff -N jit/jit-rules-x86-64.h
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ jit/jit-rules-x86-64.h      2 Mar 2008 17:07:06 -0000       1.1
@@ -0,0 +1,126 @@
+/*
+ * jit-rules-x86-64.h - Rules that define the characteristics of the x86_64.
+ *
+ * Copyright (C) 2008  Southern Storm Software, Pty Ltd.
+ *
+ * This file is part of the libjit library.
+ *
+ * The libjit library is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation, either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * The libjit library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with the libjit library.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
#ifndef	_JIT_RULES_X86_64_H
#define	_JIT_RULES_X86_64_H

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Information about all of the registers, in allocation order.
 * Each entry is {name, cpu-number, other-reg, flags}; JIT_REG_GLOBAL
 * marks callee-saved registers usable for global allocation, and the
 * x87 stack registers are flagged JIT_REG_IN_STACK.
 */
#define	JIT_REG_X86_64_FLOAT	\
	(JIT_REG_FLOAT32 | JIT_REG_FLOAT64 | JIT_REG_NFLOAT)
#define	JIT_REG_X86_64_XMM	\
	(JIT_REG_FLOAT32 | JIT_REG_FLOAT64)
#define JIT_REG_X86_64_GENERAL	\
	(JIT_REG_WORD | JIT_REG_LONG)
#define	JIT_REG_INFO	\
	{"rax", 0, -1, JIT_REG_X86_64_GENERAL | JIT_REG_CALL_USED}, \
	{"rcx", 1, -1, JIT_REG_X86_64_GENERAL | JIT_REG_CALL_USED}, \
	{"rdx", 2, -1, JIT_REG_X86_64_GENERAL | JIT_REG_CALL_USED}, \
	{"rbx", 3, -1, JIT_REG_X86_64_GENERAL | JIT_REG_GLOBAL}, \
	{"rsi", 6, -1, JIT_REG_X86_64_GENERAL | JIT_REG_CALL_USED}, \
	{"rdi", 7, -1, JIT_REG_X86_64_GENERAL | JIT_REG_CALL_USED}, \
	{"r8",  8, -1, JIT_REG_X86_64_GENERAL | JIT_REG_CALL_USED}, \
	{"r9",  9, -1, JIT_REG_X86_64_GENERAL | JIT_REG_CALL_USED}, \
	{"r10", 10, -1, JIT_REG_X86_64_GENERAL | JIT_REG_CALL_USED}, \
	{"r11", 11, -1, JIT_REG_X86_64_GENERAL | JIT_REG_CALL_USED}, \
	{"r12", 12, -1, JIT_REG_X86_64_GENERAL | JIT_REG_GLOBAL}, \
	{"r13", 13, -1, JIT_REG_X86_64_GENERAL | JIT_REG_GLOBAL}, \
	{"r14", 14, -1, JIT_REG_X86_64_GENERAL | JIT_REG_GLOBAL}, \
	{"r15", 15, -1, JIT_REG_X86_64_GENERAL | JIT_REG_GLOBAL}, \
	{"rbp", 5, -1, JIT_REG_FRAME | JIT_REG_FIXED | JIT_REG_CALL_USED}, \
	{"rsp", 4, -1, JIT_REG_STACK_PTR | JIT_REG_FIXED | JIT_REG_CALL_USED}, \
	{"xmm0", 0, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm1", 1, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm2", 2, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm3", 3, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm4", 4, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm5", 5, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm6", 6, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm7", 7, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm8", 8, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm9", 9, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm10", 10, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm11", 11, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm12", 12, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm13", 13, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm14", 14, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"xmm15", 15, -1, JIT_REG_X86_64_XMM | JIT_REG_CALL_USED}, \
	{"st0", 0, -1, JIT_REG_X86_64_FLOAT | JIT_REG_CALL_USED | JIT_REG_IN_STACK}, \
	{"st1", 1, -1, JIT_REG_X86_64_FLOAT | JIT_REG_CALL_USED | JIT_REG_IN_STACK}, \
	{"st2", 2, -1, JIT_REG_X86_64_FLOAT | JIT_REG_CALL_USED | JIT_REG_IN_STACK}, \
	{"st3", 3, -1, JIT_REG_X86_64_FLOAT | JIT_REG_CALL_USED | JIT_REG_IN_STACK}, \
	{"st4", 4, -1, JIT_REG_X86_64_FLOAT | JIT_REG_CALL_USED | JIT_REG_IN_STACK}, \
	{"st5", 5, -1, JIT_REG_X86_64_FLOAT | JIT_REG_CALL_USED | JIT_REG_IN_STACK}, \
	{"st6", 6, -1, JIT_REG_X86_64_FLOAT | JIT_REG_CALL_USED | JIT_REG_IN_STACK}, \
	{"st7", 7, -1, JIT_REG_X86_64_FLOAT | JIT_REG_CALL_USED | JIT_REG_IN_STACK},
#define	JIT_NUM_REGS		40
#define	JIT_NUM_GLOBAL_REGS	5

/* The x87 registers (entries 32-39 above) form a register stack */
#define JIT_REG_STACK		1
#define JIT_REG_STACK_START	32
#define JIT_REG_STACK_END	39

/*
 * Define to 1 if we should always load values into registers
 * before operating on them.  i.e. the CPU does not have reg-mem
 * and mem-reg addressing modes.
 */
#define	JIT_ALWAYS_REG_REG		0

/*
 * The maximum number of bytes to allocate for the prolog.
 * This may be shortened once we know the true prolog size.
 */
#define	JIT_PROLOG_SIZE			64

/*
 * Preferred alignment for the start of functions.
 */
#define	JIT_FUNCTION_ALIGNMENT	32

/*
 * Define this to 1 if the platform allows reads and writes on
 * any byte boundary.  Define to 0 if only properly-aligned
 * memory accesses are allowed.
 */
#define	JIT_ALIGN_OVERRIDES		1

/*
 * Parameter passing rules.
 */
/*
#define	JIT_CDECL_WORD_REG_PARAMS		{5, 4, 2, 1, 6, 7, -1}
#define	JIT_MAX_WORD_REG_PARAMS			6
*/
#define	JIT_INITIAL_STACK_OFFSET		(2 * sizeof(void *))
#define	JIT_INITIAL_FRAME_SIZE			0

#ifdef __cplusplus
}
#endif

#endif /* _JIT_RULES_X86_64_H */

Index: jit/jit-rules-x86-64.ins
===================================================================
RCS file: jit/jit-rules-x86-64.ins
diff -N jit/jit-rules-x86-64.ins
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ jit/jit-rules-x86-64.ins    2 Mar 2008 17:07:06 -0000       1.1
@@ -0,0 +1,1941 @@
+/*
+ * jit-rules-x86-64.ins - Instruction selector for x86_64.
+ *
+ * Copyright (C) 2008  Southern Storm Software, Pty Ltd.
+ *
+ * This file is part of the libjit library.
+ *
+ * The libjit library is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation, either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * The libjit library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with the libjit library.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+ 
/*
 * Register classes referenced by the rule patterns below.  The
 * corresponding class descriptors (x86_64_reg etc.) are declared in the
 * backend C file -- NOTE(review): exact membership of each class is
 * defined there, not here; presumably reg = general word registers,
 * freg = x87 stack registers, xreg = SSE registers.
 */
%regclass reg x86_64_reg
%regclass creg x86_64_creg
%regclass rreg x86_64_rreg
%regclass freg x86_64_freg
%regclass xreg x86_64_xreg
+ 
+/*
+ * Conversion opcodes.
+ */
+
+JIT_OP_TRUNC_SBYTE:
+       [=reg, reg] -> {
+               x86_64_movsx8_reg_reg_size(inst, $1, $2, 4);
+       }
+
+JIT_OP_TRUNC_UBYTE:
+       [=reg, reg] -> {
+               x86_64_movzx8_reg_reg_size(inst, $1, $2, 4);
+       }
+
+JIT_OP_TRUNC_SHORT:
+       [=reg, reg] -> {
+               x86_64_movsx16_reg_reg_size(inst, $1, $2, 4);
+       }
+
+JIT_OP_TRUNC_USHORT:
+       [=reg, reg] -> {
+               x86_64_movzx16_reg_reg_size(inst, $1, $2, 4);
+       }
+
+JIT_OP_TRUNC_INT:
+       [=reg, reg] -> {
+               if($1 != $2)
+               {
+                       x86_64_mov_reg_reg_size(inst, $1, $2, 4);
+               }
+       }
+
+JIT_OP_TRUNC_UINT:
+       [=reg, reg] -> {
+               if($1 != $2)
+               {
+                       x86_64_mov_reg_reg_size(inst, $1, $2, 4);
+               }
+       }
+
+JIT_OP_LOW_WORD:
+       [=reg, imm] -> {
+               x86_64_mov_reg_imm_size(inst, $1, $2, 4);
+       }
+       [=reg, local] -> {
+               x86_64_mov_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+       }
+       [=reg, reg] -> {
+               if($1 != $2)
+               {
+                       x86_64_mov_reg_reg_size(inst, $1, $2, 4);
+               }
+       }
+       
+JIT_OP_EXPAND_INT:
+       [=reg, reg] -> {
+               x86_64_movsx32_reg_reg_size(inst, $1, $2, 8);
+       }
+
+JIT_OP_EXPAND_UINT:
+       [=reg, reg] -> {
+               x86_64_mov_reg_reg_size(inst, $1, $2, 4);
+       }
+
+JIT_OP_NFLOAT_TO_INT: stack
+       [=reg, freg] -> {
+               /* allocate space on the stack for 2 shorts and 1 int */
+               x86_64_sub_reg_imm_size(inst, X86_ESP, 8, 8);
+               /* store FPU control word */
+               x86_64_fnstcw_membase(inst, X86_64_RSP, 0);
+               /* set "round toward zero" mode */
+               x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 0, 2);
+               x86_64_or_reg_imm_size(inst, $1, 0xc00, 2);
+               x86_64_mov_membase_reg_size(inst, X86_64_RSP, 2, $1, 2);
+               x86_64_fldcw_membase(inst, X86_64_RSP, 2);
+               /* convert float to int */
+               x86_64_fistp_membase_size(inst, X86_64_RSP, 4, 4);
+               /* restore FPU control word */
+               x86_64_fldcw_membase(inst, X86_64_RSP, 0);
+               /* move result to the destination */
+               x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 4, 4);
+               /* restore the stack */
+               x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+       }
+
+JIT_OP_NFLOAT_TO_LONG: stack
+       [=reg, freg] -> {
+               /* allocate space on the stack for 2 shorts and 1 long */
+               x86_64_sub_reg_imm_size(inst, X86_64_RSP, 12, 8);
+               /* store FPU control word */
+               x86_64_fnstcw_membase(inst, X86_64_RSP, 0);
+               /* set "round toward zero" mode */
+               x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 0, 2);
+               x86_64_or_reg_imm_size(inst, $1, 0xc00, 2);
+               x86_64_mov_membase_reg_size(inst, X86_64_RSP, 2, $1, 2);
+               x86_64_fldcw_membase(inst, X86_64_RSP, 2);
+               /* convert float to long */
+               x86_64_fistp_membase_size(inst, X86_64_RSP, 4, 8);
+               /* restore FPU control word */
+               x86_64_fldcw_membase(inst, X86_64_RSP, 0);
+               /* move result to the destination */
+               x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 4, 8);
+               /* restore the stack */
+               x86_64_add_reg_imm_size(inst, X86_64_RSP, 12, 8);
+       }
+
+JIT_OP_NFLOAT_TO_FLOAT32: stack
+       [=xreg, freg] -> {
+               /* Avoid modifying the stack pointer by simply using negative */
+               /* offsets here. */
+               x86_64_fstp_membase_size(inst, X86_64_RSP, -8, 4);
+               x86_64_movss_reg_membase(inst, $1, X86_64_RSP, -8);
+       }
+
+JIT_OP_NFLOAT_TO_FLOAT64: stack
+       [=xreg, freg] -> {
+               /* Avoid modifying the stack pointer by simply using negative */
+               /* offsets here. */
+               x86_64_fstp_membase_size(inst, X86_64_RSP, -8, 8);
+               x86_64_movsd_reg_membase(inst, $1, X86_64_RSP, -8);
+       }
+
+/*
+ * Data manipulation.
+ */
+
/* Copy a byte value: spill to a frame slot, or keep it in its register */
JIT_OP_COPY_LOAD_SBYTE, JIT_OP_COPY_LOAD_UBYTE, JIT_OP_COPY_STORE_BYTE: copy
	[=local, imm] -> {
		x86_64_mov_membase_imm_size(inst, X86_64_RBP, $1, $2, 1);
	}
	[=local, reg] -> {
		x86_64_mov_membase_reg_size(inst, X86_64_RBP, $1, $2, 1);
	}
	[reg] -> {}

/* Copy a 16-bit value */
JIT_OP_COPY_LOAD_SHORT, JIT_OP_COPY_LOAD_USHORT, JIT_OP_COPY_STORE_SHORT: copy
	[=local, imm] -> {
		x86_64_mov_membase_imm_size(inst, X86_64_RBP, $1, $2, 2);
	}
	[=local, reg] -> {
		x86_64_mov_membase_reg_size(inst, X86_64_RBP, $1, $2, 2);
	}
	[reg] -> {}

/* Copy a 32-bit value */
JIT_OP_COPY_INT: copy
	[=local, imm] -> {
		x86_64_mov_membase_imm_size(inst, X86_64_RBP, $1, $2, 4);
	}
	[reg] -> {}

/* Copy a 64-bit value: nothing to emit when it stays in a register */
JIT_OP_COPY_LONG: copy
	[reg] -> {}

/* Copy a 32-bit float via an SSE register */
JIT_OP_COPY_FLOAT32: copy
	[=local, xreg] -> {
		x86_64_movss_membase_reg(inst, X86_64_RBP, $1, $2);
	}
	[xreg] -> {}

/* Copy a 64-bit float via an SSE register */
JIT_OP_COPY_FLOAT64: copy
	[=local, xreg] -> {
		x86_64_movsd_membase_reg(inst, X86_64_RBP, $1, $2);
	}
	[xreg] -> {}

/* Copy a native float that lives on the x87 stack */
JIT_OP_COPY_NFLOAT: copy, stack
	[freg] -> {}

/* Copy a structure between frame slots; small structures are copied
   inline, larger ones go through the memory_copy helper */
JIT_OP_COPY_STRUCT:
	[=frame, frame, scratch reg, scratch xreg,
		if("jit_type_get_size(jit_value_get_type(insn->dest)) <= _JIT_MAX_MEMCPY_INLINE")] -> {
		inst = small_struct_copy(gen, inst, X86_64_RBP, $1, X86_64_RBP, $2,
								 jit_value_get_type(insn->dest), $3, $4);
	}
	[=frame, frame, clobber(creg), clobber(xreg)] -> {
		inst = memory_copy(gen, inst, X86_64_RBP, $1, X86_64_RBP, $2,
				   jit_type_get_size(jit_value_get_type(insn->dest)));
	}

/* Load the address of a frame slot (rbp-relative lea) */
JIT_OP_ADDRESS_OF:
	[=reg, frame] -> {
		x86_64_lea_membase_size(inst, $1, X86_64_RBP, $2, 8);
	}
+
+/*
+ * Stack pushes and pops.
+ */
+
/* Push a 32-bit integer argument onto the system stack */
JIT_OP_PUSH_INT: note
	[imm] -> {
		x86_64_push_imm(inst, $1);
		gen->stack_changed = 1;
	}
	[local] -> {
		x86_64_push_membase_size(inst, X86_64_RBP, $1, 4);
		gen->stack_changed = 1;
	}
	[reg] -> {
		x86_64_push_reg_size(inst, $1, 4);
		gen->stack_changed = 1;
	}

/* Push a 64-bit integer.  Immediates outside the signed 32-bit range
   cannot be pushed directly and are stored as two 32-bit halves
   (assumes little-endian layout of the jit_int pair). */
JIT_OP_PUSH_LONG: note
	[imm] -> {
		if(($1 >= jit_min_int) && ($1 <= jit_max_int))
		{
			x86_64_push_imm(inst, $1);
		}
		else
		{
			jit_int *ptr = (jit_int *)&($1);
			x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
			x86_64_mov_membase_imm_size(inst, X86_64_RSP, 4, ptr[1], 4);
			x86_64_mov_membase_imm_size(inst, X86_64_RSP, 0, ptr[0], 4);
		}
		gen->stack_changed = 1;
	}
	[local] -> {
		x86_64_push_membase_size(inst, X86_64_RBP, $1, 8);
		gen->stack_changed = 1;
	}
	[reg] -> {
		x86_64_push_reg_size(inst, $1, 8);
		gen->stack_changed = 1;
	}

/* Push a 32-bit float; $1 for an [imm] float is a pointer to the
   constant's bit pattern */
JIT_OP_PUSH_FLOAT32: note, stack
	[imm] -> {
		jit_int *ptr = (jit_int *)($1);
		x86_64_push_imm_size(inst, ptr[0], 4);
		gen->stack_changed = 1;
	}
	[local] -> {
		x86_64_push_membase_size(inst, X86_64_RBP, $1, 4);
		gen->stack_changed = 1;
	}
	[xreg] -> {
		x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
		x86_64_movss_membase_reg(inst, X86_64_RSP, 0, $1);
		gen->stack_changed = 1;
	}
	[freg] -> {
		x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
		x86_64_fstp_membase_size(inst, X86_64_RSP, 0, 4);
		gen->stack_changed = 1;
	}

/* Push a 64-bit float */
JIT_OP_PUSH_FLOAT64: note, stack
	[imm] -> {
		jit_int *ptr = (jit_int *)($1);
		x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
		x86_64_mov_membase_imm_size(inst, X86_64_RSP, 4, ptr[1], 4);
		x86_64_mov_membase_imm_size(inst, X86_64_RSP, 0, ptr[0], 4);
		gen->stack_changed = 1;
	}
	[local] -> {
		x86_64_push_membase_size(inst, X86_64_RBP, $1, 8);
		gen->stack_changed = 1;
	}
	[xreg] -> {
		x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
		x86_64_movsd_membase_reg(inst, X86_64_RSP, 0, $1);
		gen->stack_changed = 1;
	}
	[freg] -> {
		x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
		x86_64_fstp_membase_size(inst, X86_64_RSP, 0, 8);
		gen->stack_changed = 1;
	}

/* Push a native float.  When jit_nfloat is the 80-bit extended type
   (larger than jit_float64) 16 bytes are reserved and a third 32-bit
   chunk is stored; otherwise it is handled like a 64-bit float. */
JIT_OP_PUSH_NFLOAT: note, stack
	[imm] -> {
		jit_int *ptr = (jit_int *)($1);
		if(sizeof(jit_nfloat) != sizeof(jit_float64))
		{
			x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
			x86_64_mov_membase_imm_size(inst, X86_64_RSP, 8, ptr[2], 4);
		}
		else
		{
			x86_64_sub_reg_imm_size(inst, X86_64_RSP, sizeof(jit_float64), 8);
		}
		x86_64_mov_membase_imm_size(inst, X86_64_RSP, 4, ptr[1], 4);
		x86_64_mov_membase_imm_size(inst, X86_64_RSP, 0, ptr[0], 4);
		gen->stack_changed = 1;
	}
	[local, scratch reg] -> {
		if(sizeof(jit_nfloat) != sizeof(jit_float64))
		{
			x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
			x86_64_mov_reg_membase_size(inst, $2, X86_64_RBP, $1 + 8, 4);
			x86_64_mov_membase_reg_size(inst, X86_64_RSP, 8, $2, 4);
		}
		else
		{
			x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
		}
		x86_64_mov_reg_membase_size(inst, $2, X86_64_RBP, $1, 8);
		x86_64_mov_membase_reg_size(inst, X86_64_RSP, 0, $2, 8);
		gen->stack_changed = 1;
	}
	[freg] -> {
		if(sizeof(jit_nfloat) != sizeof(jit_float64))
		{
			x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
			x86_64_fstp_membase_size(inst, X86_64_RSP, 0, 10);
		}
		else
		{
			x86_64_sub_reg_imm_size(inst, X86_64_RSP, sizeof(jit_float64), 8);
			x86_64_fstp_membase_size(inst, X86_64_RSP, 0, 8);
		}
		gen->stack_changed = 1;
	}

/* Push a structure by value; insn->value2 holds its size in bytes.
   Small structures (<= 32 bytes) are pushed in 8-byte chunks from the
   end backwards, larger ones are copied with memory_copy. */
JIT_OP_PUSH_STRUCT: note, more_space
	[reg, if("((jit_nuint)jit_value_get_nint_constant(insn->value2)) <= 32")] -> {
		jit_nuint size;
		jit_nuint last_part;
		size = (jit_nuint)jit_value_get_nint_constant(insn->value2);
		last_part = size & 0x7;
		if(last_part)
		{
			/* Handle the possible last part smaller than 8 bytes */
			size -= last_part;

			/* We don't care about the last not needed bytes */
			x86_64_push_membase_size(inst, $1, size, 8);
		}
		/* Handle full multiple pointer sized parts */
		while(size > 0)
		{
			size -= sizeof(void *);
			x86_64_push_membase_size(inst, $1, size, 8);
		}
		gen->stack_changed = 1;
	}
	[reg, clobber(creg), clobber(xreg)] -> {
		/* Handle arbitrary-sized structures */
		jit_nuint size;
		size = (jit_nuint)jit_value_get_nint_constant(insn->value2);
		/* TODO: Maybe we should check for sizes > 2GB? */
		x86_64_sub_reg_imm_size(inst, X86_64_RSP, ROUND_STACK(size), 8);
		inst = memory_copy(gen, inst, X86_64_RSP, 0, $1, 0, size);
		gen->stack_changed = 1;
	}
+
/* Pop insn->value1->address bytes of pushed arguments after a call */
JIT_OP_POP_STACK:
	[] -> {
		x86_64_add_reg_imm_size(inst, X86_64_RSP, insn->value1->address, 8);
		gen->stack_changed = 1;
	}

/* Spill a small structure returned in registers back to memory */
JIT_OP_FLUSH_SMALL_STRUCT:
	[] -> {
		inst = flush_return_struct(inst, insn->value1);
	}

/* Return with no value: just branch to the epilog */
JIT_OP_RETURN:
	[] -> {
		inst = jump_to_epilog(gen, inst, block);
	}

/* The value is already in the required register; no code needed */
JIT_OP_RETURN_REG: manual
	[] -> {
		/* Nothing to do here */;
	}

/* Return a 32-bit integer in rax */
JIT_OP_RETURN_INT: note
	[reg("rax")] -> {
		inst = jump_to_epilog(gen, inst, block);
	}

/* Return a 64-bit integer in rax */
JIT_OP_RETURN_LONG: note
	[reg("rax")] -> {
		inst = jump_to_epilog(gen, inst, block);
	}

/* Return a 32-bit float in xmm0 */
JIT_OP_RETURN_FLOAT32: note
	[xreg("xmm0")] -> {
		inst = jump_to_epilog(gen, inst, block);
	}

/* Return a 64-bit float in xmm0 */
JIT_OP_RETURN_FLOAT64: note
	[xreg("xmm0")] -> {
		inst = jump_to_epilog(gen, inst, block);
	}

/* Return a native float on the x87 stack */
JIT_OP_RETURN_NFLOAT: note, stack
	[freg, clobber(freg)] -> {
		/* clobber(freg) frees all registers on the fp stack */
		inst = jump_to_epilog(gen, inst, block);
	}

/* Return a small structure in the return registers */
JIT_OP_RETURN_SMALL_STRUCT: note
	[rreg, imm] -> {
		inst = return_struct(inst, func, $1);
		inst = jump_to_epilog(gen, inst, block);
	}
+
+/*
+ * Pointer-relative loads and stores.
+ */
+
/*
 * Pointer-relative loads: base register plus immediate offset.  The
 * zero-offset case uses the shorter register-indirect encoding.
 */

/* Load and sign-extend a byte */
JIT_OP_LOAD_RELATIVE_SBYTE:
	[=reg, reg, imm] -> {
		if($3 == 0)
		{
			x86_64_movsx8_reg_regp_size(inst, $1, $2, 8);
		}
		else
		{
			x86_64_movsx8_reg_membase_size(inst, $1, $2, $3, 8);
		}
	}

/* Load and zero-extend a byte */
JIT_OP_LOAD_RELATIVE_UBYTE:
	[=reg, reg, imm] -> {
		if($3 == 0)
		{
			x86_64_movzx8_reg_regp_size(inst, $1, $2, 8);
		}
		else
		{
			x86_64_movzx8_reg_membase_size(inst, $1, $2, $3, 8);
		}
	}

/* Load and sign-extend a 16-bit value */
JIT_OP_LOAD_RELATIVE_SHORT:
	[=reg, reg, imm] -> {
		if($3 == 0)
		{
			x86_64_movsx16_reg_regp_size(inst, $1, $2, 8);
		}
		else
		{
			x86_64_movsx16_reg_membase_size(inst, $1, $2, $3, 8);
		}
	}

/* Load and zero-extend a 16-bit value */
JIT_OP_LOAD_RELATIVE_USHORT:
	[=reg, reg, imm] -> {
		if($3 == 0)
		{
			x86_64_movzx16_reg_regp_size(inst, $1, $2, 8);
		}
		else
		{
			x86_64_movzx16_reg_membase_size(inst, $1, $2, $3, 8);
		}
	}

/* Load a 32-bit value */
JIT_OP_LOAD_RELATIVE_INT:
	[=reg, reg, imm] -> {
		if($3 == 0)
		{
			x86_64_mov_reg_regp_size(inst, $1, $2, 4);
		}
		else
		{
			x86_64_mov_reg_membase_size(inst, $1, $2, $3, 4);
		}
	}

/* Load a 64-bit value */
JIT_OP_LOAD_RELATIVE_LONG:
	[=reg, reg, imm] -> {
		if($3 == 0)
		{
			x86_64_mov_reg_regp_size(inst, $1, $2, 8);
		}
		else
		{
			x86_64_mov_reg_membase_size(inst, $1, $2, $3, 8);
		}
	}

/* Load a 32-bit float into an SSE register */
JIT_OP_LOAD_RELATIVE_FLOAT32:
	[=xreg, reg, imm] -> {
		if($3 == 0)
		{
			x86_64_movss_reg_regp(inst, $1, $2);
		}
		else
		{
			x86_64_movss_reg_membase(inst, $1, $2, $3);
		}
	}

/* Load a 64-bit float into an SSE register */
JIT_OP_LOAD_RELATIVE_FLOAT64:
	[=xreg, reg, imm] -> {
		if($3 == 0)
		{
			x86_64_movsd_reg_regp(inst, $1, $2);
		}
		else
		{
			x86_64_movsd_reg_membase(inst, $1, $2, $3);
		}
	}
+       
+JIT_OP_LOAD_RELATIVE_NFLOAT:
+       [=freg, reg, imm, if("sizeof(jit_nfloat) != sizeof(jit_float64)")] -> {
+               x86_64_fld_membase_size(inst, $2, $3, 10);
+       }
+       [=freg, reg, imm, if("sizeof(jit_nfloat) == sizeof(jit_float64)")] -> {
+               x86_64_fld_membase_size(inst, $2, $3, 8);
+       }
+
+JIT_OP_LOAD_RELATIVE_STRUCT: more_space
+       [=frame, reg, imm, scratch reg, scratch xreg,
+               if("jit_type_get_size(jit_value_get_type(insn->dest)) <= _JIT_MAX_MEMCPY_INLINE")] -> {
+               /* Small structures are copied inline into the destination
+                  frame slot using the integer and xmm scratch registers. */
+               inst = small_struct_copy(gen, inst, X86_64_RBP, $1, $2, $3,
+                                        jit_value_get_type(insn->dest), $4, $5);
+       }
+       [=frame, reg, imm, clobber(creg), clobber(xreg)] -> {
+               /* Larger structures fall back to a generic memory copy.
+                  Bug fix: the destination base must be the x86-64 frame
+                  pointer macro X86_64_RBP, not the 32-bit X86_EBP macro,
+                  matching small_struct_copy above and the store variant
+                  of this rule elsewhere in this file. */
+               inst = memory_copy(gen, inst, X86_64_RBP, $1, $2, $3,
+                                  jit_type_get_size(jit_value_get_type(insn->dest)));
+       }
+
+/*
+ * Pointer-relative stores: value ($2, immediate or register) is stored
+ * at [$1 + $3]; a zero displacement uses the shorter regp encoding.
+ */
+JIT_OP_STORE_RELATIVE_BYTE: ternary
+       [reg, imm, imm] -> {
+               if($3 == 0)
+               {
+                       x86_64_mov_regp_imm_size(inst, $1, $2, 1);
+               }
+               else
+               {
+                       x86_64_mov_membase_imm_size(inst, $1, $3, $2, 1);
+               }
+       }
+       [reg, reg, imm] -> {
+               if($3 == 0)
+               {
+                       x86_64_mov_regp_reg_size(inst, $1, $2, 1);
+               }
+               else
+               {
+                       x86_64_mov_membase_reg_size(inst, $1, $3, $2, 1);
+               }
+       }
+
+JIT_OP_STORE_RELATIVE_SHORT: ternary
+       [reg, imm, imm] -> {
+               if($3 == 0)
+               {
+                       x86_64_mov_regp_imm_size(inst, $1, $2, 2);
+               }
+               else
+               {
+                       x86_64_mov_membase_imm_size(inst, $1, $3, $2, 2);
+               }
+       }
+       [reg, reg, imm] -> {
+               if($3 == 0)
+               {
+                       x86_64_mov_regp_reg_size(inst, $1, $2, 2);
+               }
+               else
+               {
+                       x86_64_mov_membase_reg_size(inst, $1, $3, $2, 2);
+               }
+       }
+
+JIT_OP_STORE_RELATIVE_INT: ternary
+       [reg, imm, imm] -> {
+               if($3 == 0)
+               {
+                       x86_64_mov_regp_imm_size(inst, $1, $2, 4);
+               }
+               else
+               {
+                       x86_64_mov_membase_imm_size(inst, $1, $3, $2, 4);
+               }
+       }
+       [reg, reg, imm] -> {
+               if($3 == 0)
+               {
+                       x86_64_mov_regp_reg_size(inst, $1, $2, 4);
+               }
+               else
+               {
+                       x86_64_mov_membase_reg_size(inst, $1, $3, $2, 4);
+               }
+       }
+
+/*
+ * The immediate pattern is restricted to imm32 because x86-64 8-byte
+ * stores sign-extend a 32-bit immediate; out-of-range constants fall
+ * through to the register pattern.
+ */
+JIT_OP_STORE_RELATIVE_LONG: ternary
+       [reg, imm, imm, if("$2 >= jit_min_int && $2 <= jit_max_int")] -> {
+               if($3 == 0)
+               {
+                       x86_64_mov_regp_imm_size(inst, $1, $2, 8);
+               }
+               else
+               {
+                       x86_64_mov_membase_imm_size(inst, $1, $3, $2, 8);
+               }
+       }
+       [reg, reg, imm] -> {
+               if($3 == 0)
+               {
+                       x86_64_mov_regp_reg_size(inst, $1, $2, 8);
+               }
+               else
+               {
+                       x86_64_mov_membase_reg_size(inst, $1, $3, $2, 8);
+               }
+       }
+
+/*
+ * Float immediates are passed by address: $2 points at the constant,
+ * whose raw bit pattern is stored through integer moves.
+ */
+JIT_OP_STORE_RELATIVE_FLOAT32: ternary
+       [reg, imm, imm] -> {
+               if($3 == 0)
+               {
+                       x86_64_mov_regp_imm_size(inst, $1, ((jit_int 
+*)($2))[0], 4);
+               }
+               else
+               {
+                       x86_64_mov_membase_imm_size(inst, $1, $3, ((jit_int 
+*)($2))[0], 4);
+               }
+       }
+       [reg, xreg, imm] -> {
+               if($3 == 0)
+               {
+                       x86_64_movss_regp_reg(inst, $1, $2);
+               }
+               else
+               {       
+                       x86_64_movss_membase_reg(inst, $1, $3, $2);
+               }
+       }
+
+/*
+ * The 64-bit float constant is stored as two 32-bit halves
+ * (little-endian: low word first).
+ * NOTE(review): unlike the sibling rules, the imm pattern does not
+ * special-case $3 == 0 -- membase with a zero offset is still correct,
+ * just a slightly longer encoding.
+ */
+JIT_OP_STORE_RELATIVE_FLOAT64: ternary
+       [reg, imm, imm] -> {
+               x86_64_mov_membase_imm_size(inst, $1, $3, ((int *)($2))[0], 4);
+               x86_64_mov_membase_imm_size(inst, $1, $3 + 4, ((int *)($2))[1], 
+4);
+       }
+       [reg, xreg, imm] -> {
+               if($3 == 0)
+               {
+                       x86_64_movsd_regp_reg(inst, $1, $2);
+               }
+               else
+               {       
+                       x86_64_movsd_membase_reg(inst, $1, $3, $2);
+               }
+       }
+
+JIT_OP_STORE_RELATIVE_STRUCT: ternary
+       [reg, frame, imm, scratch reg, scratch xreg,
+               if("jit_type_get_size(jit_value_get_type(insn->value1)) <= 
+_JIT_MAX_MEMCPY_INLINE")] -> {
+               inst = small_struct_copy(gen, inst, $1, $3, X86_64_RBP, $2,
+                                                                
+jit_value_get_type(insn->value1), $4, $5);
+       }
+       [reg, frame, imm, clobber(creg), clobber(xreg)] -> {
+               inst = memory_copy(gen, inst, $1, $3, X86_64_RBP, $2,
+                                  
+jit_type_get_size(jit_value_get_type(insn->value1)));
+       }
+
+/*
+ * Adjust a pointer by a constant; a zero adjustment emits nothing.
+ * The imm32 guard exists because add with an 8-byte operand only
+ * accepts a sign-extended 32-bit immediate.
+ */
+JIT_OP_ADD_RELATIVE:
+       [reg, imm, if("$2 >= jit_min_int && $2 <= jit_max_int")] -> {
+               if(insn->value2->address != 0)
+               {
+                       x86_64_add_reg_imm_size(inst, $1, $2, 8);
+               }
+       }
+
+/*
+ * Array element loads and stores.
+ */
+
+JIT_OP_LOAD_ELEMENT_SBYTE:
+       [=reg, reg, reg] -> {
+               x86_64_movsx8_reg_memindex_size(inst, $1, $2, 0, $3, 0, 4);
+       }
+
+JIT_OP_LOAD_ELEMENT_UBYTE:
+       [=reg, reg, reg] -> {
+               x86_64_movzx8_reg_memindex_size(inst, $1, $2, 0, $3, 0, 4);
+       }
+
+JIT_OP_LOAD_ELEMENT_SHORT:
+       [=reg, reg, reg] -> {
+               x86_64_movsx16_reg_memindex_size(inst, $1, $2, 0, $3, 1, 4);
+       }
+
+JIT_OP_LOAD_ELEMENT_USHORT:
+       [=reg, reg, reg] -> {
+               x86_64_movzx16_reg_memindex_size(inst, $1, $2, 0, $3, 1, 4);
+       }
+
+JIT_OP_LOAD_ELEMENT_INT:
+       [=reg, reg, reg] -> {
+               x86_64_mov_reg_memindex_size(inst, $1, $2, 0, $3, 2, 4);
+       }
+
+JIT_OP_LOAD_ELEMENT_LONG:
+       [=reg, reg, reg] -> {
+               x86_64_mov_reg_memindex_size(inst, $1, $2, 0, $3, 3, 8);
+       }
+
+JIT_OP_LOAD_ELEMENT_FLOAT32:
+       [=xreg, reg, reg] -> {
+               x86_64_movss_reg_memindex(inst, $1, $2, 0, $3, 2);
+       }
+
+JIT_OP_LOAD_ELEMENT_FLOAT64:
+       [=xreg, reg, reg] -> {
+               x86_64_movsd_reg_memindex(inst, $1, $2, 0, $3, 3);
+       }
+
+JIT_OP_STORE_ELEMENT_BYTE: ternary
+       [reg, reg, reg] -> {
+               x86_64_mov_memindex_reg_size(inst, $1, 0, $2, 0, $3, 1);
+       }
+
+JIT_OP_STORE_ELEMENT_SHORT: ternary
+       [reg, reg, reg] -> {
+               x86_64_mov_memindex_reg_size(inst, $1, 0, $2, 1, $3, 2);
+       }
+
+JIT_OP_STORE_ELEMENT_INT: ternary
+       [reg, reg, reg] -> {
+               x86_64_mov_memindex_reg_size(inst, $1, 0, $2, 2, $3, 4);
+       }
+
+JIT_OP_STORE_ELEMENT_LONG: ternary
+       [reg, reg, imm] -> {
+               if($3 >= jit_min_int && $3 <= jit_max_int)
+               {
+                       x86_64_mov_memindex_imm_size(inst, $1, 0, $2, 3, $3, 8);
+               }
+               else
+               {
+                       jit_int *long_ptr = (jit_int *)(&($3));
+
+                       x86_64_mov_memindex_imm_size(inst, $1, 0, $2, 3, 
long_ptr[0], 4);
+                       x86_64_mov_memindex_imm_size(inst, $1, 4, $2, 3, 
long_ptr[1], 4);
+               }
+       }
+       [reg, reg, reg] -> {
+               x86_64_mov_memindex_reg_size(inst, $1, 0, $2, 3, $3, 8);
+       }
+
+JIT_OP_STORE_ELEMENT_FLOAT32: ternary
+       [reg, reg, xreg] -> {
+               x86_64_movss_memindex_reg(inst, $1, 0, $2, 2, $3);
+       }
+
+JIT_OP_STORE_ELEMENT_FLOAT64: ternary
+       [reg, reg, xreg] -> {
+               x86_64_movsd_memindex_reg(inst, $1, 0, $2, 3, $3);
+       }
+
+/*
+ * Arithmetic opcodes.
+ */
+
+/*
+ * 4 byte integer versions
+ */
+
+JIT_OP_IADD: commutative
+       [reg, imm] -> {
+               if($2 == 1)
+               {
+                       x86_64_inc_reg_size(inst, $1, 4);
+               }
+               else
+               {
+                       x86_64_add_reg_imm_size(inst, $1, $2, 4);
+               }
+       }
+       [reg, local] -> {
+               x86_64_add_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+       }
+       [reg, reg] -> {
+               x86_64_add_reg_reg_size(inst, $1, $2, 4);
+       }
+
+JIT_OP_ISUB:
+       [reg, imm] -> {
+               if($2 == 1)
+               {
+                       x86_64_dec_reg_size(inst, $1, 4);
+               }
+               else
+               {
+                       x86_64_sub_reg_imm_size(inst, $1, $2, 4);
+               }
+       }
+       [reg, local] -> {
+               x86_64_sub_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+       }
+       [reg, reg] -> {
+               x86_64_sub_reg_reg_size(inst, $1, $2, 4);
+       }
+
+JIT_OP_INEG:
+       [reg] -> {
+               x86_64_neg_reg_size(inst, $1, 4);
+       }
+
+/*
+ * 8 byte integer versions
+ */
+
+JIT_OP_LADD: commutative
+       [reg, imm, if("$2 >= jit_min_int && $2 <= jit_max_int")] -> {
+               if($2 == 1)
+               {
+                       x86_64_inc_reg_size(inst, $1, 8);
+               }
+               else
+               {
+                       x86_64_add_reg_imm_size(inst, $1, $2, 8);
+               }
+       }
+       [reg, local] -> {
+               x86_64_add_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+       }
+       [reg, reg] -> {
+               x86_64_add_reg_reg_size(inst, $1, $2, 8);
+       }
+
+JIT_OP_LSUB:
+       [reg, imm, if("$2 >= jit_min_int && $2 <= jit_max_int")] -> {
+               if($2 == 1)
+               {
+                       x86_64_dec_reg_size(inst, $1, 8);
+               }
+               else
+               {
+                       x86_64_sub_reg_imm_size(inst, $1, $2, 8);
+               }
+       }
+       [reg, local] -> {
+               x86_64_sub_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+       }
+       [reg, reg] -> {
+               x86_64_sub_reg_reg_size(inst, $1, $2, 8);
+       }
+
+JIT_OP_LNEG:
+       [reg] -> {
+               x86_64_neg_reg_size(inst, $1, 8);
+       }
+/*
+ * single precision float versions
+ */
+
+JIT_OP_FADD:
+       [xreg, imm] -> {
+               _jit_xmm1_reg_imm_size_float32(gen, &inst, XMM1_ADD, $1, 
(jit_float32 *)$2);
+       }
+       [xreg, local] -> {
+               x86_64_addss_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [xreg, xreg] -> {
+               x86_64_addss_reg_reg(inst, $1, $2);
+       }
+
+JIT_OP_FSUB:
+       [xreg, imm] -> {
+               _jit_xmm1_reg_imm_size_float32(gen, &inst, XMM1_SUB, $1, 
(jit_float32 *)$2);
+       }
+       [xreg, xreg] -> {
+               x86_64_subss_reg_reg(inst, $1, $2);
+       }
+       [xreg, local] -> {
+               x86_64_subss_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+
+JIT_OP_FMUL:
+       [xreg, imm] -> {
+               _jit_xmm1_reg_imm_size_float32(gen, &inst, XMM1_MUL, $1, 
(jit_float32 *)$2);
+       }
+       [xreg, xreg] -> {
+               x86_64_mulss_reg_reg(inst, $1, $2);
+       }
+       [xreg, local] -> {
+               x86_64_mulss_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+
+JIT_OP_FDIV:
+       [xreg, imm] -> {
+               _jit_xmm1_reg_imm_size_float32(gen, &inst, XMM1_DIV, $1, 
(jit_float32 *)$2);
+       }
+       [xreg, xreg] -> {
+               x86_64_divss_reg_reg(inst, $1, $2);
+       }
+       [xreg, local] -> {
+               x86_64_divss_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+
+/*
+ * double precision float versions
+ */
+
+JIT_OP_DADD:
+       [xreg, imm] -> {
+               _jit_xmm1_reg_imm_size_float64(gen, &inst, XMM1_ADD, $1, 
(jit_float64 *)$2);
+       }
+       [xreg, local] -> {
+               x86_64_addsd_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [xreg, xreg] -> {
+               x86_64_addsd_reg_reg(inst, $1, $2);
+       }
+
+JIT_OP_DSUB:
+       [xreg, imm] -> {
+               _jit_xmm1_reg_imm_size_float64(gen, &inst, XMM1_SUB, $1, 
(jit_float64 *)$2);
+       }
+       [xreg, local] -> {
+               x86_64_subsd_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [xreg, xreg] -> {
+               x86_64_subsd_reg_reg(inst, $1, $2);
+       }
+
+JIT_OP_DMUL:
+       [xreg, imm] -> {
+               _jit_xmm1_reg_imm_size_float64(gen, &inst, XMM1_MUL, $1, 
(jit_float64 *)$2);
+       }
+       [xreg, local] -> {
+               x86_64_mulsd_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [xreg, xreg] -> {
+               x86_64_mulsd_reg_reg(inst, $1, $2);
+       }
+
+JIT_OP_DDIV:
+       [xreg, imm] -> {
+               _jit_xmm1_reg_imm_size_float64(gen, &inst, XMM1_DIV, $1, 
(jit_float64 *)$2);
+       }
+       [xreg, local] -> {
+               x86_64_divsd_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [xreg, xreg] -> {
+               x86_64_divsd_reg_reg(inst, $1, $2);
+       }
+
+/*
+ * Bitwise opcodes.
+ */
+
+JIT_OP_IAND: commutative
+       [reg, imm] -> {
+               x86_64_and_reg_imm_size(inst, $1, $2, 4);
+       }
+       [reg, local] -> {
+               x86_64_and_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+       }
+       [reg, reg] -> {
+               x86_64_and_reg_reg_size(inst, $1, $2, 4);
+       }
+
+JIT_OP_IOR: commutative
+       [reg, imm] -> {
+               x86_64_or_reg_imm_size(inst, $1, $2, 4);
+       }
+       [reg, local] -> {
+               x86_64_or_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+       }
+       [reg, reg] -> {
+               x86_64_or_reg_reg_size(inst, $1, $2, 4);
+       }
+
+JIT_OP_IXOR: commutative
+       [reg, imm] -> {
+               x86_64_xor_reg_imm_size(inst, $1, $2, 4);
+       }
+       [reg, local] -> {
+               x86_64_xor_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+       }
+       [reg, reg] -> {
+               x86_64_xor_reg_reg_size(inst, $1, $2, 4);
+       }
+
+JIT_OP_INOT:
+       [reg] -> {
+               x86_64_not_reg_size(inst, $1, 4);
+       }
+
+JIT_OP_LAND: commutative
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_and_reg_imm_size(inst, $1, $2, 8);
+       }
+       [reg, local] -> {
+               x86_64_and_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+       }
+       [reg, reg] -> {
+               x86_64_and_reg_reg_size(inst, $1, $2, 8);
+       }
+
+JIT_OP_LOR: commutative
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_or_reg_imm_size(inst, $1, $2, 8);
+       }
+       [reg, local] -> {
+               x86_64_or_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+       }
+       [reg, reg] -> {
+               x86_64_or_reg_reg_size(inst, $1, $2, 8);
+       }
+
+JIT_OP_LXOR: commutative
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_xor_reg_imm_size(inst, $1, $2, 8);
+       }
+       [reg, local] -> {
+               x86_64_xor_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+       }
+       [reg, reg] -> {
+               x86_64_xor_reg_reg_size(inst, $1, $2, 8);
+       }
+
+JIT_OP_LNOT:
+       [reg] -> {
+               x86_64_not_reg_size(inst, $1, 8);
+       }
+
+
+
+/*
+ * Branch opcodes.
+ */
+
+JIT_OP_BR: branch
+       [] -> {
+               inst = output_branch(func, inst, 0xEB /* jmp */, insn);
+       }
+
+JIT_OP_BR_IFALSE: branch
+       [reg] -> {
+               x86_64_or_reg_reg_size(inst, $1, $1, 4);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+
+JIT_OP_BR_ITRUE: branch
+       [reg] -> {
+               x86_64_or_reg_reg_size(inst, $1, $1, 4);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+
+JIT_OP_BR_IEQ: branch
+       [reg, immzero] -> {
+               x86_64_or_reg_reg_size(inst, $1, $1, 4);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+       [reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+
+JIT_OP_BR_INE: branch
+       [reg, immzero] -> {
+               x86_64_or_reg_reg_size(inst, $1, $1, 4);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+       [reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+
+JIT_OP_BR_ILT: branch
+       [reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x7C /* lt */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+               inst = output_branch(func, inst, 0x7C /* lt */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x7C /* lt */, insn);
+       }
+
+JIT_OP_BR_ILT_UN: branch
+       [reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+               inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+       }
+
+JIT_OP_BR_ILE: branch
+       [reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x7E /* le */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+               inst = output_branch(func, inst, 0x7E /* le */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x7E /* le */, insn);
+       }
+
+JIT_OP_BR_ILE_UN: branch
+       [reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+               inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+       }
+
+JIT_OP_BR_IGT: branch
+       [reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x7F /* gt */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+               inst = output_branch(func, inst, 0x7F /* gt */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x7F /* gt */, insn);
+       }
+
+JIT_OP_BR_IGT_UN: branch
+       [reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+               inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+       }
+
+JIT_OP_BR_IGE: branch
+       [reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x7D /* ge */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+               inst = output_branch(func, inst, 0x7D /* ge */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x7D /* ge */, insn);
+       }
+
+JIT_OP_BR_IGE_UN: branch
+       [reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 4);
+               inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+       }
+
+JIT_OP_BR_LFALSE: branch
+       [reg] -> {
+               x86_64_or_reg_reg_size(inst, $1, $1, 8);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+
+JIT_OP_BR_LTRUE: branch
+       [reg] -> {
+               x86_64_or_reg_reg_size(inst, $1, $1, 8);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+
+JIT_OP_BR_LEQ: branch
+       [reg, immzero] -> {
+               x86_64_or_reg_reg_size(inst, $1, $1, 8);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+
+JIT_OP_BR_LNE: branch
+       [reg, immzero] -> {
+               x86_64_or_reg_reg_size(inst, $1, $1, 8);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+
+JIT_OP_BR_LLT: branch
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x7C /* lt */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+               inst = output_branch(func, inst, 0x7C /* lt */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x7C /* lt */, insn);
+       }
+
+JIT_OP_BR_LLT_UN: branch
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+               inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+       }
+
+JIT_OP_BR_LLE: branch
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x7E /* le */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+               inst = output_branch(func, inst, 0x7E /* le */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x7E /* le */, insn);
+       }
+
+JIT_OP_BR_LLE_UN: branch
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+               inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+       }
+
+JIT_OP_BR_LGT: branch
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x7F /* gt */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+               inst = output_branch(func, inst, 0x7F /* gt */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x7F /* gt */, insn);
+       }
+
+JIT_OP_BR_LGT_UN: branch
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+               inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+       }
+
+JIT_OP_BR_LGE: branch
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x7D /* ge */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+               inst = output_branch(func, inst, 0x7D /* ge */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x7D /* ge */, insn);
+       }
+
+JIT_OP_BR_LGE_UN: branch
+       [reg, imm, if("($2 >= jit_min_int && $2 <= jit_max_int)")] -> {
+               x86_64_cmp_reg_imm_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+       }
+       [reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $1, X86_64_RBP, $2, 8);
+               inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+       }
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+       }
+
+/*
+ * Comparison opcodes.
+ */
+
+JIT_OP_IEQ:
+       [=reg, reg, immzero] -> {
+               x86_64_or_reg_reg_size(inst, $2, $2, 4);
+               inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+       }
+       [=reg, reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+       }
+
+JIT_OP_INE:
+       [=reg, reg, immzero] -> {
+               x86_64_or_reg_reg_size(inst, $2, $2, 4);
+               inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+       }
+       [=reg, reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+       }
+
+JIT_OP_ILT:
+       [=reg, reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 1);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 1);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 1);
+       }
+
+JIT_OP_ILT_UN:
+       [=reg, reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 0);
+       }
+
+JIT_OP_ILE:
+       [=reg, reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 1);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 1);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 1);
+       }
+
+JIT_OP_ILE_UN:
+       [=reg, reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 0);
+       }
+
+JIT_OP_IGT:
+       [=reg, reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 1);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 1);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 1);
+       }
+
+JIT_OP_IGT_UN:
+       [=reg, reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 0);
+       }
+
+JIT_OP_IGE:
+       [=reg, reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 1);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 1);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 1);
+       }
+
+JIT_OP_IGE_UN:
+       [=reg, reg, imm] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 4);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 0);
+       }
+
+/*
+ * 64-bit integer comparisons.  Identical structure to the 32-bit rules
+ * above but with operand size 8.  The imm patterns are guarded with
+ * "$3 >= jit_min_int && $3 <= jit_max_int" because the x86-64 "cmp"
+ * instruction only takes a sign-extended 32-bit immediate; immediates
+ * outside that range fall through to the reg pattern (the value gets
+ * loaded into a register first).
+ */
+JIT_OP_LEQ:
+       [=reg, reg, immzero] -> {
+               /* "or reg,reg" sets ZF for a compare against zero */
+               x86_64_or_reg_reg_size(inst, $2, $2, 8);
+               inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+       }
+       [=reg, reg, imm, if("$3 >= jit_min_int && $3 <= jit_max_int")] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+       }
+
+JIT_OP_LNE:
+       [=reg, reg, immzero] -> {
+               x86_64_or_reg_reg_size(inst, $2, $2, 8);
+               inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+       }
+       [=reg, reg, imm, if("$3 >= jit_min_int && $3 <= jit_max_int")] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+       }
+
+JIT_OP_LLT:
+       [=reg, reg, imm, if("$3 >= jit_min_int && $3 <= jit_max_int")] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 1);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 1);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 1);
+       }
+
+JIT_OP_LLT_UN:
+       [=reg, reg, imm, if("$3 >= jit_min_int && $3 <= jit_max_int")] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LT, 0);
+       }
+
+JIT_OP_LLE:
+       [=reg, reg, imm, if("$3 >= jit_min_int && $3 <= jit_max_int")] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 1);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 1);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 1);
+       }
+
+JIT_OP_LLE_UN:
+       [=reg, reg, imm, if("$3 >= jit_min_int && $3 <= jit_max_int")] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_LE, 0);
+       }
+
+JIT_OP_LGT:
+       [=reg, reg, imm, if("$3 >= jit_min_int && $3 <= jit_max_int")] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 1);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 1);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 1);
+       }
+
+JIT_OP_LGT_UN:
+       [=reg, reg, imm, if("$3 >= jit_min_int && $3 <= jit_max_int")] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GT, 0);
+       }
+
+JIT_OP_LGE:
+       [=reg, reg, imm, if("$3 >= jit_min_int && $3 <= jit_max_int")] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 1);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 1);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 1);
+       }
+
+JIT_OP_LGE_UN:
+       [=reg, reg, imm, if("$3 >= jit_min_int && $3 <= jit_max_int")] -> {
+               x86_64_cmp_reg_imm_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 0);
+       }
+       [=reg, reg, local] -> {
+               x86_64_cmp_reg_membase_size(inst, $2, X86_64_RBP, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 0);
+       }
+       [=reg, reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $2, $3, 8);
+               inst = setcc_reg(inst, $1, X86_CC_GE, 0);
+       }
+
+/*
+ * Pointer check opcodes.
+ */
+
+/*
+ * Null pointer check.  The signal-based variant (let the hardware fault
+ * and have the SIGSEGV handler raise the exception) is intentionally
+ * compiled out via "#if 0 &&" -- presumably kept as a template until
+ * signal-based null checks are supported on this backend; confirm intent
+ * before enabling.  The active path tests the register and falls through
+ * to an inline throw of JIT_RESULT_NULL_REFERENCE when it is zero.
+ */
+JIT_OP_CHECK_NULL: note
+       [reg] -> {
+#if 0 && defined(JIT_USE_SIGNALS)
+               /* if $1 contains NULL this generates SEGV and the signal
+                  handler will throw the exception  */
+               x86_64_cmp_reg_membase_size(inst, $1, $1, 0, 8);
+#else
+               unsigned char *patch;
+               /* "or reg,reg" sets ZF iff the pointer is NULL */
+               x86_64_or_reg_reg_size(inst, $1, $1, 8);
+               patch = inst;
+               /* Skip the throw when the pointer is non-NULL; the 8-bit
+                  branch displacement is back-patched below */
+               x86_branch8(inst, X86_CC_NE, 0, 0);
+               inst = throw_builtin(inst, func, JIT_RESULT_NULL_REFERENCE);
+               x86_patch(patch, inst);
+#endif
+       }
+
+/*
+ * Function calls.
+ */
+
+/*
+ * Direct and indirect call opcodes.  All *_TAIL variants first tear down
+ * the current frame (restore RSP from RBP, pop the saved RBP) and then
+ * jump rather than call, so the callee returns directly to our caller.
+ * The indirect/vtable forms expect the target address to already be in
+ * X86_64_SCRATCH -- presumably loaded by the register allocator or a
+ * preceding instruction; verify against the call setup code.
+ */
+JIT_OP_CALL:
+       [] -> {
+               /* NOTE(review): this local deliberately shadows the outer
+                  "func" (the function being compiled) with the callee */
+               jit_function_t func = (jit_function_t)(insn->dest);
+               inst = x86_64_call_code(inst, 
(jit_nint)jit_function_to_closure(func));
+       }
+
+JIT_OP_CALL_TAIL:
+       [] -> {
+               jit_function_t func = (jit_function_t)(insn->dest);
+               x86_64_mov_reg_reg_size(inst, X86_64_RSP, X86_64_RBP, 8);
+               x86_64_pop_reg_size(inst, X86_64_RBP, 8);
+               x86_64_jump_to_code(inst, 
(jit_nint)jit_function_to_closure(func));
+       }
+
+JIT_OP_CALL_INDIRECT:
+       [] -> {
+               x86_64_call_reg(inst, X86_64_SCRATCH);
+       }
+
+JIT_OP_CALL_INDIRECT_TAIL:
+       [] -> {
+               x86_64_mov_reg_reg_size(inst, X86_64_RSP, X86_64_RBP, 8);
+               x86_64_pop_reg_size(inst, X86_64_RBP, 8);
+               x86_64_jmp_reg(inst, X86_64_SCRATCH);
+       }
+
+JIT_OP_CALL_VTABLE_PTR:
+       [] -> {
+               x86_64_call_reg(inst, X86_64_SCRATCH);
+       }
+
+JIT_OP_CALL_VTABLE_PTR_TAIL:
+       [] -> {
+               x86_64_mov_reg_reg_size(inst, X86_64_RSP, X86_64_RBP, 8);
+               x86_64_pop_reg_size(inst, X86_64_RBP, 8);
+               x86_64_jmp_reg(inst, X86_64_SCRATCH);
+       }
+
+JIT_OP_CALL_EXTERNAL:
+       [] -> {
+               /* insn->dest holds the raw native address of the external
+                  function */
+               inst = x86_64_call_code(inst, (jit_nint)(insn->dest));
+       }
+
+JIT_OP_CALL_EXTERNAL_TAIL:
+       [] -> {
+               x86_64_mov_reg_reg_size(inst, X86_64_RSP, X86_64_RBP, 8);
+               x86_64_pop_reg_size(inst, X86_64_RBP, 8);
+               x86_64_jump_to_code(inst, (jit_nint)(insn->dest));
+       }
+
+
+/*
+ * Exception handling.
+ */
+
+/*
+ * Throw the exception object in $1.  The value is moved into RDI --
+ * the first integer argument register of the System V AMD64 ABI -- and
+ * jit_exception_throw is called.  If the current function contains a
+ * "setjmp" block, the address of the throw site is first stored into
+ * the setjmp frame so the handler can report where the throw happened.
+ */
+JIT_OP_THROW: branch
+       [reg] -> {
+               x86_64_mov_reg_reg_size(inst, X86_64_RDI, $1, 8);
+               if(func->builder->setjmp_value != 0)
+               {
+                       jit_nint pc_offset;
+
+                       /* We have a "setjmp" block in the current function,
+                          so we must record the location of the throw first */
+                       _jit_gen_fix_value(func->builder->setjmp_value);
+                       pc_offset = func->builder->setjmp_value->frame_offset +
+                                                       jit_jmp_catch_pc_offset;
+
+                       /* lea reg, [rip+0] yields the address of the next
+                          instruction, i.e. the current pc */
+                       x86_64_lea_membase_size(inst, X86_64_SCRATCH, 
X86_64_RIP, 0, 8);
+                       x86_64_mov_membase_reg_size(inst, X86_64_RBP, pc_offset,
+                                                                               
X86_64_SCRATCH, 8);
+               }
+               inst = x86_64_call_code(inst, (jit_nint)jit_exception_throw);
+       }
+
+JIT_OP_RETHROW: manual
+       [] -> { /* Not used in native code back ends */ }
+
+JIT_OP_LOAD_PC:
+       [=reg] -> {
+               /* lea reg, [rip+0] loads the address of the following
+                  instruction into the destination register */
+               x86_64_lea_membase_size(inst, $1, X86_64_RIP, 0, 8);
+       }
+
+JIT_OP_LOAD_EXCEPTION_PC: manual
+       [] -> { /* Not used in native code back ends */ }
+
+JIT_OP_ENTER_FINALLY:
+       [] -> {
+               /* The return address is on the stack */
+               /* NOTE(review): the extra 8-byte adjustment paired with
+                  JIT_OP_LEAVE_FINALLY below presumably keeps RSP at the
+                  alignment/offset the rest of the frame expects while
+                  inside the finally body -- confirm against the prologue
+                  conventions */
+               x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+        }
+
+JIT_OP_LEAVE_FINALLY: branch
+       [] -> {
+               /* The "finally" return address is on the stack */
+               /* Undo the ENTER_FINALLY adjustment, then return to the
+                  JIT_OP_CALL_FINALLY call site */
+               x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+               x86_64_ret(inst);
+       }
+
+/*
+ * Call a "finally" block via a native call instruction.  If the target
+ * block has already been emitted we call its address directly; otherwise
+ * we emit a placeholder displacement and chain this call site onto the
+ * block's fixup list so it gets patched once the block is output.
+ */
+JIT_OP_CALL_FINALLY: branch
+       [] -> {
+               jit_block_t block;
+
+               block = jit_block_from_label(func, (jit_label_t)(insn->dest));
+               if(!block)
+               {
+                       return;
+               }
+
+               if(block->address)
+               {
+                       inst = x86_64_call_code(inst, (jit_nint)block->address);
+               }
+               else
+               {
+                       jit_int fixup;
+
+                       if(block->fixup_list)
+                       {
+                               /* Encode the offset to the previous fixup in
+                                  the placeholder (inst + 1 skips the call
+                                  opcode byte to the displacement field) */
+                               fixup = _JIT_CALC_FIXUP(block->fixup_list, inst 
+ 1);
+                       }
+                       else
+                       {
+                               fixup = 0;
+                       }
+                       block->fixup_list = (void *)(inst + 1);
+                       x86_64_call_imm(inst, fixup);
+               }
+       }
+
+/*
+ * Load the address of a label into the destination register using a
+ * RIP-relative lea.  We assume the displacement between the label and
+ * the current instruction fits in the +-2GB range of a 32-bit disp.
+ * The lea emits a zero disp32; "inst - 4" points back at that field so
+ * it can be patched (now, or later via the block's fixup list).
+ * NOTE(review): "block" has no local declaration here, matching the
+ * sibling rules' generated-scope usage -- presumably declared by the
+ * gen-rules wrapper; confirm against the generated _jit_gen_insn.
+ */
+JIT_OP_ADDRESS_OF_LABEL:
+       [=reg] -> {
+               jit_int *fixup;
+
+               block = jit_block_from_label(func, (jit_label_t)(insn->value1));
+               if(!block)
+               {
+                       /* Unknown label: bail out the same way
+                          JIT_OP_CALL_FINALLY does instead of dereferencing
+                          a null block */
+                       return;
+               }
+               if(block->address)
+               {
+                       /* The label is already emitted: patch the disp32 of
+                          the lea with the pc-relative distance (relative to
+                          the end of the lea, which is "inst" after emission) */
+                       x86_64_lea_membase_size(inst, $1, X86_64_RIP, 0, 8);
+                       fixup = (jit_int *)(inst - 4);
+                       fixup[0] = (jit_int)((jit_nint)block->address - (jit_nint)inst);
+               }
+               else
+               {
+                       /* Forward reference: leave the zero placeholder (or
+                          the encoded offset to the previous fixup) in the
+                          disp32 and chain this site on the fixup list */
+                       x86_64_lea_membase_size(inst, $1, X86_64_RIP, 0, 8);
+                       fixup = (jit_int *)(inst - 4);
+                       if(block->fixup_list)
+                       {
+                               fixup[0] = _JIT_CALC_FIXUP(block->fixup_list, fixup);
+                       }
+                       block->fixup_list = (void *)fixup;
+               }
+       }
+
+/*
+ * Block operations.
+ */
+
+/*
+ * memcpy with a constant length.  Three strategies, tried in order:
+ * zero/negative length is a no-op; lengths up to
+ * _JIT_MAX_MEMCPY_INLINE are copied inline with scratch general and
+ * xmm registers; anything larger falls back to the out-of-line
+ * memory_copy helper, which clobbers the caller-saved registers.
+ */
+JIT_OP_MEMCPY: ternary
+       [any, any, imm, if("$3 <= 0")] -> { }
+       [reg, reg, imm, scratch reg, scratch xreg,
+               if("$3 <= _JIT_MAX_MEMCPY_INLINE")] -> {
+               inst = small_block_copy(gen, inst, $1, 0, $2, 0, $3, $4, $5, 0);
+       }
+       [reg, reg, imm, clobber(creg), clobber(xreg)] -> {
+               inst = memory_copy(gen, inst, $1, 0, $2, 0, $3);
+       }
+
+/*
+ * Switch dispatch through a table of 64-bit absolute label addresses.
+ * $1 = index register, $2 = pointer to the label array, $3 = number of
+ * labels.  The table itself is allocated in the code cache; entries for
+ * blocks not yet emitted hold the previous head of the block's absolute
+ * fixup list so they can be patched later.  The unsigned "AE" bounds
+ * check also catches negative indexes (they compare as huge unsigned
+ * values) and falls through past the dispatch.
+ */
+JIT_OP_JUMP_TABLE: ternary, branch
+       [reg, imm, imm, scratch reg, space("64")] -> {
+               unsigned char *patch_jump_table;
+               unsigned char *patch_fall_through;
+               int index;
+               jit_label_t *labels;
+               jit_nint num_labels;
+               jit_block_t block;
+
+               labels = (jit_label_t *) $2;
+               num_labels = $3;
+
+               patch_jump_table = (unsigned char 
*)_jit_cache_alloc(&(gen->posn),
+                                                                               
                                 sizeof(void *) * $3);
+               if(!patch_jump_table)
+               {
+                       /* The cache is full */
+                       return;
+               }
+
+               x86_64_mov_reg_imm_size(inst, $4, (jit_nint)patch_jump_table, 
8);
+               x86_64_cmp_reg_imm_size(inst, $1, num_labels, 8);
+               patch_fall_through = inst;
+               x86_branch32(inst, X86_CC_AE, 0, 0);
+
+               if(func->builder->position_independent)
+               {
+                       /* TODO */
+                       TODO();
+               }
+               else
+               {
+                       /* jmp [table + index*8]: scale shift 3 selects
+                          8-byte entries */
+                       x86_64_jmp_memindex(inst, $4, 0, $1, 3);
+               }
+
+               for(index = 0; index < num_labels; index++)
+               {
+                       block = jit_block_from_label(func, labels[index]);
+                       if(!block)
+                       {
+                               return;
+                       }
+
+                       if(func->builder->position_independent)
+                       {
+                               /* TODO */
+                               TODO();
+                       }
+                       else
+                       {
+                               if(block->address)
+                               {
+                                       x86_64_imm_emit64(patch_jump_table, 
(jit_nint)(block->address));
+                               }
+                               else
+                               {
+                                       /* Output a placeholder and record on 
the block's absolute fixup list */
+                                       x86_64_imm_emit64(patch_jump_table, 
(jit_nint)(block->fixup_absolute_list));
+                                       block->fixup_absolute_list = (void 
*)(patch_jump_table - 8);
+                               }
+                       }
+               }
+
+               x86_patch(patch_fall_through, inst);
+       }




reply via email to

[Prev in Thread] Current Thread [Next in Thread]