dotgnu-pnet-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[dotgnu-pnet-commits] libjit ChangeLog jit/jit-gen-x86-64.h jit/jit-r...


From: Klaus Treichel
Subject: [dotgnu-pnet-commits] libjit ChangeLog jit/jit-gen-x86-64.h jit/jit-r...
Date: Sun, 13 Apr 2008 17:55:37 +0000

CVSROOT:        /cvsroot/dotgnu-pnet
Module name:    libjit
Changes by:     Klaus Treichel <ktreichel>      08/04/13 17:55:37

Modified files:
        .              : ChangeLog 
        jit            : jit-gen-x86-64.h jit-rules-x86-64.c 
                         jit-rules-x86-64.ins 

Log message:
        Add support for more opcodes on x86-64.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/libjit/ChangeLog?cvsroot=dotgnu-pnet&r1=1.360&r2=1.361
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-gen-x86-64.h?cvsroot=dotgnu-pnet&r1=1.4&r2=1.5
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-rules-x86-64.c?cvsroot=dotgnu-pnet&r1=1.2&r2=1.3
http://cvs.savannah.gnu.org/viewcvs/libjit/jit/jit-rules-x86-64.ins?cvsroot=dotgnu-pnet&r1=1.3&r2=1.4

Patches:
Index: ChangeLog
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/ChangeLog,v
retrieving revision 1.360
retrieving revision 1.361
diff -u -b -r1.360 -r1.361
--- ChangeLog   13 Apr 2008 16:14:15 -0000      1.360
+++ ChangeLog   13 Apr 2008 17:55:36 -0000      1.361
@@ -10,6 +10,20 @@
        * include/jit/jit-walk.h: use _JIT_ARCH_GET_RETURN_ADDRESS and
        _JIT_ARCH_GET_CURRENT_RETURN if available.
 
+       * jit/jit-gen-x86-64.h: Add additional macros for saving and
+       restoring the fpu control word and the mxcsr register. Add
+       additional SSE conversion macros. Add SSE compare macros.
+       Add macros for the SSE bit operations on packed values.
+       Add macros for SSE sqrt and rounding. Add macros for fpu rounding.
+
+       * jit/jit-rules-x86-64.c: Add the dreg register class and functions
+       to handle rounding and SSE bit opcodes on packed values.
+
+       * jit/jit-rules-x86-64.ins: Add INT_TO_NFLOAT, LONG_TO_NFLOAT,
+       FLOAT32_TO_NFLOAT, FLOAT64_TO_NFLOAT.
+       Rewrite NFLOAT_TO_INT and NFLOAT_TO_LONG to use the new functions
+       in jit-rules-x86-64.c. Add handling of ABS, NEG and float compares.
+       
 2008-03-31  Klaus Treichel  <address@hidden>
 
        * jit/jit-rules-x86.ins: Fix the sign opcode for integers and the

Index: jit/jit-gen-x86-64.h
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/jit/jit-gen-x86-64.h,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -b -r1.4 -r1.5
--- jit/jit-gen-x86-64.h        30 Mar 2008 15:05:13 -0000      1.4
+++ jit/jit-gen-x86-64.h        13 Apr 2008 17:55:36 -0000      1.5
@@ -111,6 +111,28 @@
 } X86_64_XMM1_OP;
 
 /*
+ * Logical opcodes used with packed single and double precision values.
+ */
+typedef enum
+{
+       XMM_ANDP                = 0x54,
+       XMM_ORP                 = 0x56,
+       XMM_XORP                = 0x57
+} X86_64_XMM_PLOP;
+
+/*
+ * Rounding modes for xmm rounding instructions, the mxcsr register and
+ * the fpu control word.
+ */
+typedef enum
+{
+       X86_ROUND_NEAREST       = 0x00,         /* Round to the nearest integer 
*/
+       X86_ROUND_DOWN          = 0x01,         /* Round towards negative 
infinity */
+       X86_ROUND_UP            = 0x02,         /* Round towards positive 
infinity */
+       X86_ROUND_ZERO          = 0x03          /* Round towards zero 
(truncate) */
+} X86_64_ROUNDMODE;
+
+/*
  * Helper union for emmitting 64 bit immediate values.
  */
 typedef union
@@ -3560,6 +3582,59 @@
        } while(0)
 
 /*
+ * xmm instructions with a prefix and three opcodes
+ */
+#define x86_64_p1_xmm3_reg_reg_size(inst, p1, opc1, opc2, opc3, r, reg, size) \
+       do { \
+               *(inst)++ = (unsigned char)(p1); \
+               x86_64_rex_emit(inst, (size), (r), 0, (reg)); \
+               *(inst)++ = (unsigned char)(opc1); \
+               *(inst)++ = (unsigned char)(opc2); \
+               *(inst)++ = (unsigned char)(opc3); \
+               x86_64_reg_emit(inst, (r), (reg)); \
+       } while(0)
+
+#define x86_64_p1_xmm3_reg_regp_size(inst, p1, opc1, opc2, opc3, r, regp, 
size) \
+       do { \
+               *(inst)++ = (unsigned char)(p1); \
+               x86_64_rex_emit(inst, (size), (r), 0, (regp)); \
+               *(inst)++ = (unsigned char)(opc1); \
+               *(inst)++ = (unsigned char)(opc2); \
+               *(inst)++ = (unsigned char)(opc3); \
+               x86_64_regp_emit(inst, (r), (regp)); \
+       } while(0)
+
+#define x86_64_p1_xmm3_reg_mem_size(inst, p1, opc1, opc2, opc3, r, mem, size) \
+       do { \
+               *(inst)++ = (unsigned char)(p1); \
+               x86_64_rex_emit(inst, (size), (r), 0, 0); \
+               *(inst)++ = (unsigned char)(opc1); \
+               *(inst)++ = (unsigned char)(opc2); \
+               *(inst)++ = (unsigned char)(opc3); \
+               x86_64_mem_emit(inst, (r), (mem)); \
+       } while(0)
+
+#define x86_64_p1_xmm3_reg_membase_size(inst, p1, opc1, opc2, opc3, r, 
basereg, disp, size) \
+       do { \
+               *(inst)++ = (unsigned char)(p1); \
+               x86_64_rex_emit(inst, (size), (r), 0, (basereg)); \
+               *(inst)++ = (unsigned char)(opc1); \
+               *(inst)++ = (unsigned char)(opc2); \
+               *(inst)++ = (unsigned char)(opc3); \
+               x86_64_membase_emit(inst, (r), (basereg), (disp)); \
+       } while(0)
+
+#define x86_64_p1_xmm3_reg_memindex_size(inst, p1, opc1, opc2, opc3, r, 
basereg, disp, indexreg, shift, size) \
+       do { \
+               *(inst)++ = (unsigned char)(p1); \
+               x86_64_rex_emit(inst, (size), (r), (indexreg), (basereg)); \
+               *(inst)++ = (unsigned char)(opc1); \
+               *(inst)++ = (unsigned char)(opc2); \
+               *(inst)++ = (unsigned char)(opc3); \
+               x86_64_memindex_emit((inst), (r), (basereg), (disp), 
(indexreg), (shift)); \
+       } while(0)
+
+/*
  * xmm1: Macro for use of the X86_64_XMM1 enum
  */
 #define x86_64_xmm1_reg_reg(inst, opc, dreg, sreg, is_double) \
@@ -3588,6 +3663,56 @@
        } while(0)
 
 /*
+ * Load and store MXCSR register state
+ */
+
+/*
+ * ldmxcsr: Load MXCSR register
+ */
+#define x86_64_ldmxcsr_regp(inst, sregp) \
+       do { \
+               x86_64_xmm2_reg_regp((inst), 0x0f, 0xae, 2, (sregp)); \
+       } while(0)
+
+#define x86_64_ldmxcsr_mem(inst, mem) \
+       do { \
+               x86_64_xmm2_reg_mem((inst), 0x0f, 0xae, 2, (mem)); \
+       } while(0)
+
+#define x86_64_ldmxcsr_membase(inst, basereg, disp) \
+       do { \
+               x86_64_xmm2_reg_membase((inst), 0x0f, 0xae, 2, (basereg), 
(disp)); \
+       } while(0)
+
+#define x86_64_ldmxcsr_memindex(inst, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_xmm2_reg_memindex((inst), 0x0f, 0xae, 2, (basereg), 
(disp), (indexreg), (shift)); \
+       } while(0)
+
+/*
+ * stmxcsr: Store MXCSR register
+ */
+#define x86_64_stmxcsr_regp(inst, sregp) \
+       do { \
+               x86_64_xmm2_reg_regp((inst), 0x0f, 0xae, 3, (sregp)); \
+       } while(0)
+
+#define x86_64_stmxcsr_mem(inst, mem) \
+       do { \
+               x86_64_xmm2_reg_mem((inst), 0x0f, 0xae, 3, (mem)); \
+       } while(0)
+
+#define x86_64_stmxcsr_membase(inst, basereg, disp) \
+       do { \
+               x86_64_xmm2_reg_membase((inst), 0x0f, 0xae, 3, (basereg), 
(disp)); \
+       } while(0)
+
+#define x86_64_stmxcsr_memindex(inst, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_xmm2_reg_memindex((inst), 0x0f, 0xae, 3, (basereg), 
(disp), (indexreg), (shift)); \
+       } while(0)
+
+/*
  * Move instructions
  */
 
@@ -3952,6 +4077,66 @@
        } while(0)
 
 /*
+ * cvtss2si: Convert float32 to a signed integer using the rounding mode
+ * in the mxcsr register
+ * The size is the size of the integer value (4 or 8)
+ */
+#define x86_64_cvtss2si_reg_reg_size(inst, dreg, sxreg, size) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x2d, (dreg), 
(sxreg), (size)); \
+       } while(0)
+
+#define x86_64_cvtss2si_reg_regp_size(inst, dreg, sregp, size) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x2d, (dreg), 
(sregp), (size)); \
+       } while(0)
+
+#define x86_64_cvtss2si_reg_mem_size(inst, dreg, mem, size) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x2d, (dreg), 
(mem), (size)); \
+       } while(0)
+
+#define x86_64_cvtss2si_reg_membase_size(inst, dreg, basereg, disp, size) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x2d, 
(dreg), (basereg), (disp), (size)); \
+       } while(0)
+
+#define x86_64_cvtss2si_reg_memindex_size(inst, dreg, basereg, disp, indexreg, 
shift, size) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x2d, 
(dreg), (basereg), (disp), (indexreg), (shift), (size)); \
+       } while(0)
+
+/*
+ * cvtsd2si: Convert float64 to a signed integer using the rounding mode
+ * in the mxcsr register
+ * The size is the size of the integer value (4 or 8)
+ */
+#define x86_64_cvtsd2si_reg_reg_size(inst, dreg, sxreg, size) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x2d, (dreg), 
(sxreg), (size)); \
+       } while(0)
+
+#define x86_64_cvtsd2si_reg_regp_size(inst, dreg, sregp, size) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x2d, (dreg), 
(sregp), (size)); \
+       } while(0)
+
+#define x86_64_cvtsd2si_reg_mem_size(inst, dreg, mem, size) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x2d, (dreg), 
(mem), (size)); \
+       } while(0)
+
+#define x86_64_cvtsd2si_reg_membase_size(inst, dreg, basereg, disp, size) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x2d, 
(dreg), (basereg), (disp), (size)); \
+       } while(0)
+
+#define x86_64_cvtsd2si_reg_memindex_size(inst, dreg, basereg, disp, indexreg, 
shift, size) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x2d, 
(dreg), (basereg), (disp), (indexreg), (shift), (size)); \
+       } while(0)
+
+/*
  * cvtss2sd: Convert float32 to float64
  */
 #define x86_64_cvtss2sd_reg_reg(inst, dreg, sreg) \
@@ -4008,6 +4193,122 @@
        } while(0)
 
 /*
+ * Compare opcodes
+ */
+
+/*
+ * comiss: Compare ordered scalar single precision values
+ */
+#define x86_64_comiss_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_xmm2_reg_reg((inst), 0x0f, 0x2f, (dreg), (sreg)); \
+       } while(0)
+
+#define x86_64_comiss_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_xmm2_reg_regp((inst), 0x0f, 0x2f, (dreg), (sregp)); \
+       } while(0)
+
+#define x86_64_comiss_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_xmm2_reg_mem((inst), 0x0f, 0x2f, (dreg), (mem)); \
+       } while(0)
+
+#define x86_64_comiss_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_xmm2_reg_membase((inst), 0x0f, 0x2f, (dreg), (basereg), 
(disp)); \
+       } while(0)
+
+#define x86_64_comiss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) 
\
+       do { \
+               x86_64_xmm2_reg_memindex((inst), 0x0f, 0x2f, (dreg), (basereg), 
(disp), (indexreg), (shift)); \
+       } while(0)
+
+/*
+ * comisd: Compare ordered scalar double precision values
+ */
+#define x86_64_comisd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, 0x2f, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_comisd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, 0x2f, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_comisd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, 0x2f, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_comisd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, 0x2f, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_comisd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) 
\
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, 0x2f, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * ucomiss: Compare unordered scalar single precision values
+ */
+#define x86_64_ucomiss_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_xmm2_reg_reg((inst), 0x0f, 0x2e, (dreg), (sreg)); \
+       } while(0)
+
+#define x86_64_ucomiss_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_xmm2_reg_regp((inst), 0x0f, 0x2e, (dreg), (sregp)); \
+       } while(0)
+
+#define x86_64_ucomiss_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_xmm2_reg_mem((inst), 0x0f, 0x2e, (dreg), (mem)); \
+       } while(0)
+
+#define x86_64_ucomiss_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_xmm2_reg_membase((inst), 0x0f, 0x2e, (dreg), (basereg), 
(disp)); \
+       } while(0)
+
+#define x86_64_ucomiss_reg_memindex(inst, dreg, basereg, disp, indexreg, 
shift) \
+       do { \
+               x86_64_xmm2_reg_memindex((inst), 0x0f, 0x2e, (dreg), (basereg), 
(disp), (indexreg), (shift)); \
+       } while(0)
+
+/*
+ * ucomisd: Compare unordered scalar double precision values
+ */
+#define x86_64_ucomisd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, 0x2e, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_ucomisd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, 0x2e, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_ucomisd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, 0x2e, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_ucomisd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, 0x2e, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_ucomisd_reg_memindex(inst, dreg, basereg, disp, indexreg, 
shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, 0x2e, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
  * Arithmetic opcodes
  */
 
@@ -4124,132 +4425,603 @@
        } while(0)
 
 /*
- * addsd: Add scalar double precision float values
+ * Macros for the logical operations with packed single precision values.
  */
-#define x86_64_addsd_reg_reg(inst, dreg, sreg) \
+#define x86_64_plops_reg_reg(inst, op, dreg, sreg) \
        do { \
-               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x58, (dreg), 
(sreg), 0); \
+               x86_64_xmm2_reg_reg((inst), 0x0f, (op), (dreg), (sreg)); \
        } while(0)
 
-#define x86_64_addsd_reg_regp(inst, dreg, sregp) \
+#define x86_64_plops_reg_regp(inst, op, dreg, sregp) \
        do { \
-               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x58, (dreg), 
(sregp), 0); \
+               x86_64_xmm2_reg_regp((inst), 0x0f, (op), (dreg), (sregp)); \
        } while(0)
 
-#define x86_64_addsd_reg_mem(inst, dreg, mem) \
+#define x86_64_plops_reg_mem(inst, op, dreg, mem) \
        do { \
-               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x58, (dreg), 
(mem), 0); \
+               x86_64_xmm2_reg_mem((inst), 0x0f, (op), (dreg), (mem)); \
        } while(0)
 
-#define x86_64_addsd_reg_membase(inst, dreg, basereg, disp) \
+#define x86_64_plops_reg_membase(inst, op, dreg, basereg, disp) \
        do { \
-               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x58, 
(dreg), (basereg), (disp), 0); \
+               x86_64_xmm2_reg_membase((inst), 0x0f, (op), (dreg), (basereg), 
(disp)); \
        } while(0)
 
-#define x86_64_addsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+#define x86_64_plops_reg_memindex(inst, op, dreg, basereg, disp, indexreg, 
shift) \
        do { \
-               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x58, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+               x86_64_xmm2_reg_memindex((inst), 0x0f, (op), (dreg), (basereg), 
(disp), (indexreg), (shift)); \
        } while(0)
 
 /*
- * subsd: Substract scalar double precision float values
+ * andps: And
  */
-#define x86_64_subsd_reg_reg(inst, dreg, sreg) \
+#define x86_64_andps_reg_reg(inst, dreg, sreg) \
        do { \
-               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5c, (dreg), 
(sreg), 0); \
+               x86_64_xmm2_reg_reg((inst), 0x0f, 0x54, (dreg), (sreg)); \
        } while(0)
 
-#define x86_64_subsd_reg_regp(inst, dreg, sregp) \
+#define x86_64_andps_reg_regp(inst, dreg, sregp) \
        do { \
-               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5c, (dreg), 
(sregp), 0); \
+               x86_64_xmm2_reg_regp((inst), 0x0f, 0x54, (dreg), (sregp)); \
        } while(0)
 
-#define x86_64_subsd_reg_mem(inst, dreg, mem) \
+#define x86_64_andps_reg_mem(inst, dreg, mem) \
        do { \
-               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5c, (dreg), 
(mem), 0); \
+               x86_64_xmm2_reg_mem((inst), 0x0f, 0x54, (dreg), (mem)); \
        } while(0)
 
-#define x86_64_subsd_reg_membase(inst, dreg, basereg, disp) \
+#define x86_64_andps_reg_membase(inst, dreg, basereg, disp) \
        do { \
-               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5c, 
(dreg), (basereg), (disp), 0); \
+               x86_64_xmm2_reg_membase((inst), 0x0f, 0x54, (dreg), (basereg), 
(disp)); \
        } while(0)
 
-#define x86_64_subsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+#define x86_64_andps_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
        do { \
-               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5c, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+               x86_64_xmm2_reg_memindex((inst), 0x0f, 0x54, (dreg), (basereg), 
(disp), (indexreg), (shift)); \
        } while(0)
 
 /*
- * mulsd: Multiply scalar double precision float values
+ * orps: Or
  */
-#define x86_64_mulsd_reg_reg(inst, dreg, sreg) \
+#define x86_64_orps_reg_reg(inst, dreg, sreg) \
        do { \
-               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x59, (dreg), 
(sreg), 0); \
+               x86_64_xmm2_reg_reg((inst), 0x0f, 0x56, (dreg), (sreg)); \
        } while(0)
 
-#define x86_64_mulsd_reg_regp(inst, dreg, sregp) \
+#define x86_64_orps_reg_regp(inst, dreg, sregp) \
        do { \
-               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x59, (dreg), 
(sregp), 0); \
+               x86_64_xmm2_reg_regp((inst), 0x0f, 0x56, (dreg), (sregp)); \
        } while(0)
 
-#define x86_64_mulsd_reg_mem(inst, dreg, mem) \
+#define x86_64_orps_reg_mem(inst, dreg, mem) \
        do { \
-               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x59, (dreg), 
(mem), 0); \
+               x86_64_xmm2_reg_mem((inst), 0x0f, 0x56, (dreg), (mem)); \
        } while(0)
 
-#define x86_64_mulsd_reg_membase(inst, dreg, basereg, disp) \
+#define x86_64_orps_reg_membase(inst, dreg, basereg, disp) \
        do { \
-               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x59, 
(dreg), (basereg), (disp), 0); \
+               x86_64_xmm2_reg_membase((inst), 0x0f, 0x56, (dreg), (basereg), 
(disp)); \
        } while(0)
 
-#define x86_64_mulsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+#define x86_64_orps_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
        do { \
-               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x59, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+               x86_64_xmm2_reg_memindex((inst), 0x0f, 0x56, (dreg), (basereg), 
(disp), (indexreg), (shift)); \
        } while(0)
 
 /*
- * divsd: Divide scalar double precision float values
+ * xorps: Xor
  */
-#define x86_64_divsd_reg_reg(inst, dreg, sreg) \
+#define x86_64_xorps_reg_reg(inst, dreg, sreg) \
        do { \
-               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5e, (dreg), 
(sreg), 0); \
+               x86_64_xmm2_reg_reg((inst), 0x0f, 0x57, (dreg), (sreg)); \
        } while(0)
 
-#define x86_64_divsd_reg_regp(inst, dreg, sregp) \
+#define x86_64_xorps_reg_regp(inst, dreg, sregp) \
        do { \
-               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5e, (dreg), 
(sregp), 0); \
+               x86_64_xmm2_reg_regp((inst), 0x0f, 0x57, (dreg), (sregp)); \
        } while(0)
 
-#define x86_64_divsd_reg_mem(inst, dreg, mem) \
+#define x86_64_xorps_reg_mem(inst, dreg, mem) \
        do { \
-               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5e, (dreg), 
(mem), 0); \
+               x86_64_xmm2_reg_mem((inst), 0x0f, 0x57, (dreg), (mem)); \
        } while(0)
 
-#define x86_64_divsd_reg_membase(inst, dreg, basereg, disp) \
+#define x86_64_xorps_reg_membase(inst, dreg, basereg, disp) \
        do { \
-               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5e, 
(dreg), (basereg), (disp), 0); \
+               x86_64_xmm2_reg_membase((inst), 0x0f, 0x57, (dreg), (basereg), 
(disp)); \
        } while(0)
 
-#define x86_64_divsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+#define x86_64_xorps_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
        do { \
-               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5e, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+               x86_64_xmm2_reg_memindex((inst), 0x0f, 0x57, (dreg), (basereg), 
(disp), (indexreg), (shift)); \
        } while(0)
 
 /*
- * fpu instructions
+ * maxss: Maximum value
  */
+#define x86_64_maxss_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x5f, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_maxss_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x5f, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_maxss_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x5f, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_maxss_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x5f, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_maxss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x5f, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
 
 /*
- * fld
+ * minss: Minimum value
  */
+#define x86_64_minss_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x5d, (dreg), 
(sreg), 0); \
+       } while(0)
 
-#define x86_64_fld_regp_size(inst, sregp, size) \
+#define x86_64_minss_reg_regp(inst, dreg, sregp) \
        do { \
-               x86_64_rex_emit((inst), 0, 0, 0, (sregp)); \
-               switch(size) \
-               { \
-                       case 4: \
-                       { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x5d, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_minss_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x5d, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_minss_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x5d, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_minss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x5d, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * sqrtss: Square root
+ */
+#define x86_64_sqrtss_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf3, 0x0f, 0x51, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_sqrtss_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf3, 0x0f, 0x51, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_sqrtss_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf3, 0x0f, 0x51, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_sqrtss_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf3, 0x0f, 0x51, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_sqrtss_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) 
\
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf3, 0x0f, 0x51, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+
+/*
+ * Macros for the logical operations with packed double precision values.
+ */
+#define x86_64_plopd_reg_reg(inst, op, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, (op), (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_plopd_reg_regp(inst, op, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, (op), (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_plopd_reg_mem(inst, op, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, (op), (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_plopd_reg_membase(inst, op, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, (op), 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_plopd_reg_memindex(inst, op, dreg, basereg, disp, indexreg, 
shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, (op), (dreg), 
(basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * addsd: Add scalar double precision float values
+ */
+#define x86_64_addsd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x58, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_addsd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x58, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_addsd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x58, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_addsd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x58, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_addsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x58, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * subsd: Subtract scalar double precision float values
+ */
+#define x86_64_subsd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5c, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_subsd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5c, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_subsd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5c, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_subsd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5c, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_subsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5c, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * mulsd: Multiply scalar double precision float values
+ */
+#define x86_64_mulsd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x59, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_mulsd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x59, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_mulsd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x59, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_mulsd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x59, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_mulsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x59, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * divsd: Divide scalar double precision float values
+ */
+#define x86_64_divsd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5e, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_divsd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5e, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_divsd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5e, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_divsd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5e, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_divsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5e, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * andpd: And
+ */
+#define x86_64_andpd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, 0x54, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_andpd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, 0x54, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_andpd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, 0x54, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_andpd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, 0x54, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_andpd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, 0x54, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * orpd: Or
+ */
+#define x86_64_orpd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, 0x56, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_orpd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, 0x56, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_orpd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, 0x56, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_orpd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, 0x56, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_orpd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, 0x56, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * xorpd: Xor
+ */
+#define x86_64_xorpd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0x66, 0x0f, 0x57, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_xorpd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0x66, 0x0f, 0x57, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_xorpd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0x66, 0x0f, 0x57, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_xorpd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0x66, 0x0f, 0x57, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_xorpd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0x66, 0x0f, 0x57, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * maxsd: Maximum value
+ */
+#define x86_64_maxsd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5f, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_maxsd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5f, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_maxsd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5f, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_maxsd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5f, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_maxsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5f, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * minsd: Minimum value
+ */
+#define x86_64_minsd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x5d, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_minsd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x5d, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_minsd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x5d, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_minsd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x5d, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_minsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) \
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x5d, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * sqrtsd: Square root
+ */
+#define x86_64_sqrtsd_reg_reg(inst, dreg, sreg) \
+       do { \
+               x86_64_p1_xmm2_reg_reg_size((inst), 0xf2, 0x0f, 0x51, (dreg), 
(sreg), 0); \
+       } while(0)
+
+#define x86_64_sqrtsd_reg_regp(inst, dreg, sregp) \
+       do { \
+               x86_64_p1_xmm2_reg_regp_size((inst), 0xf2, 0x0f, 0x51, (dreg), 
(sregp), 0); \
+       } while(0)
+
+#define x86_64_sqrtsd_reg_mem(inst, dreg, mem) \
+       do { \
+               x86_64_p1_xmm2_reg_mem_size((inst), 0xf2, 0x0f, 0x51, (dreg), 
(mem), 0); \
+       } while(0)
+
+#define x86_64_sqrtsd_reg_membase(inst, dreg, basereg, disp) \
+       do { \
+               x86_64_p1_xmm2_reg_membase_size((inst), 0xf2, 0x0f, 0x51, 
(dreg), (basereg), (disp), 0); \
+       } while(0)
+
+#define x86_64_sqrtsd_reg_memindex(inst, dreg, basereg, disp, indexreg, shift) 
\
+       do { \
+               x86_64_p1_xmm2_reg_memindex_size((inst), 0xf2, 0x0f, 0x51, 
(dreg), (basereg), (disp), (indexreg), (shift), 0); \
+       } while(0)
+
+/*
+ * Rounding: Available in SSE 4.1 only
+ */
+
+/*
+ * roundss: Round scalar single precision value
+ */
+#define x86_64_roundss_reg_reg(inst, dreg, sreg, mode) \
+       do { \
+               x86_64_p1_xmm3_reg_reg_size((inst), 0x66, 0x0f, 0x3a, 0x0a, 
(dreg), (sreg), 0); \
+               x86_imm_emit8((inst), (mode)); \
+       } while(0)
+
+#define x86_64_roundss_reg_regp(inst, dreg, sregp, mode) \
+       do { \
+               x86_64_p1_xmm3_reg_regp_size((inst), 0x66, 0x0f, 0x3a, 0x0a, 
(dreg), (sregp), 0); \
+               x86_imm_emit8((inst), (mode)); \
+       } while(0)
+
+#define x86_64_roundss_reg_mem(inst, dreg, mem, mode) \
+       do { \
+               x86_64_p1_xmm3_reg_mem_size((inst), 0x66, 0x0f, 0x3a, 0x0a, 
(dreg), (mem), 0); \
+               x86_imm_emit8((inst), (mode)); \
+       } while(0)
+
+#define x86_64_roundss_reg_membase(inst, dreg, basereg, disp, mode) \
+       do { \
+               x86_64_p1_xmm3_reg_membase_size((inst), 0x66, 0x0f, 0x3a, 0x0a, 
(dreg), (basereg), (disp), 0); \
+               x86_imm_emit8((inst), (mode)); \
+       } while(0)
+
+#define x86_64_roundss_reg_memindex(inst, dreg, basereg, disp, indexreg, 
shift, mode) \
+       do { \
+               x86_64_p1_xmm3_reg_memindex_size((inst), 0x66, 0x0f, 0x3a, 
0x0a, (dreg), (basereg), (disp), (indexreg), (shift), 0); \
+               x86_imm_emit8((inst), (mode)); \
+       } while(0)
+
+/*
+ * roundsd: Round scalar double precision value
+ */
+#define x86_64_roundsd_reg_reg(inst, dreg, sreg, mode) \
+       do { \
+               x86_64_p1_xmm3_reg_reg_size((inst), 0x66, 0x0f, 0x3a, 0x0b, 
(dreg), (sreg), 0); \
+               x86_imm_emit8((inst), (mode)); \
+       } while(0)
+
+#define x86_64_roundsd_reg_regp(inst, dreg, sregp, mode) \
+       do { \
+               x86_64_p1_xmm3_reg_regp_size((inst), 0x66, 0x0f, 0x3a, 0x0b, 
(dreg), (sregp), 0); \
+               x86_imm_emit8((inst), (mode)); \
+       } while(0)
+
+#define x86_64_roundsd_reg_mem(inst, dreg, mem, mode) \
+       do { \
+               x86_64_p1_xmm3_reg_mem_size((inst), 0x66, 0x0f, 0x3a, 0x0b, 
(dreg), (mem), 0); \
+               x86_imm_emit8((inst), (mode)); \
+       } while(0)
+
+#define x86_64_roundsd_reg_membase(inst, dreg, basereg, disp, mode) \
+       do { \
+               x86_64_p1_xmm3_reg_membase_size((inst), 0x66, 0x0f, 0x3a, 0x0b, 
(dreg), (basereg), (disp), 0); \
+               x86_imm_emit8((inst), (mode)); \
+       } while(0)
+
+#define x86_64_roundsd_reg_memindex(inst, dreg, basereg, disp, indexreg, 
shift, mode) \
+       do { \
+               x86_64_p1_xmm3_reg_memindex_size((inst), 0x66, 0x0f, 0x3a, 
0x0b, (dreg), (basereg), (disp), (indexreg), (shift), 0); \
+               x86_imm_emit8((inst), (mode)); \
+       } while(0)
+
+/*
+ * Clear xmm register
+ */
+#define x86_64_clear_xreg(inst, reg) \
+       do { \
+               x86_64_xorps_reg_reg((inst), (reg), (reg)); \
+       } while(0)
+
+/*
+ * fpu instructions
+ */
+
+/*
+ * fld
+ */
+
+#define x86_64_fld_regp_size(inst, sregp, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (sregp)); \
+               switch(size) \
+               { \
+                       case 4: \
+                       { \
                            *(inst)++ = (unsigned char)0xd9; \
                                x86_64_regp_emit((inst), 0, (sregp)); \
                        } \
@@ -4349,7 +5121,7 @@
 /*
  * fild: Load an integer and convert it to long double
  */
-#define x86_fild_mem_size(inst, mem, size) \
+#define x86_64_fild_mem_size(inst, mem, size) \
        do { \
                switch(size) \
                { \
@@ -4374,7 +5146,7 @@
                } \
        } while (0)
 
-#define x86_fild_membase_size(inst, mem, size) \
+#define x86_64_fild_membase_size(inst, basereg, disp, size) \
        do { \
                x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
                switch(size) \
@@ -4486,7 +5258,6 @@
 /*
  * fstp: store top fpu register to memory and pop it from the fpu stack
  */
-
 #define x86_64_fstp_regp_size(inst, sregp, size) \
        do { \
                x86_64_rex_emit((inst), 0, 0, 0, (sregp)); \
@@ -4591,7 +5362,7 @@
        } while(0)
 
 /*
- * Convert long double to integer
+ * fistp: Convert long double to integer
  */
 #define x86_64_fistp_mem_size(inst, mem, size) \
        do { \
@@ -4618,8 +5389,35 @@
                } \
        } while(0)
 
+#define x86_64_fistp_regp_size(inst, dregp, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (dregp)); \
+               switch((size)) \
+               { \
+                       case 2: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_regp_emit((inst), 3, (dregp)); \
+                       } \
+                       break; \
+                       case 4: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_regp_emit((inst), 3, (dregp)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_regp_emit((inst), 7, (dregp)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
 #define x86_64_fistp_membase_size(inst, basereg, disp, size) \
        do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
                switch((size)) \
                { \
                        case 2: \
@@ -4644,13 +5442,114 @@
        } while(0)
 
 /*
+ * frndint: Round st(0) to integer according to the rounding mode set in the 
fpu control word.
+ */
+#define x86_64_frndint(inst) \
+       do { \
+               *(inst)++ = (unsigned char)0xd9; \
+               *(inst)++ = (unsigned char)0xfc; \
+       } while(0)
+
+/*
+ * fisttp: Convert long double to integer using truncation as rounding mode. 
Available in SSE 3 only.
+ */
+#define x86_64_fisttp_regp_size(inst, dregp, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (dregp)); \
+               switch((size)) \
+               { \
+                       case 2: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_regp_emit((inst), 1, (dregp)); \
+                       } \
+                       break; \
+                       case 4: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_regp_emit((inst), 1, (dregp)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_regp_emit((inst), 1, (dregp)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+#define x86_64_fisttp_mem_size(inst, mem, size) \
+       do { \
+               switch((size)) \
+               { \
+                       case 2: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_mem_emit((inst), 1, (mem)); \
+                       } \
+                       break; \
+                       case 4: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_mem_emit((inst), 1, (mem)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_mem_emit((inst), 1, (mem)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+#define x86_64_fisttp_membase_size(inst, basereg, disp, size) \
+       do { \
+               x86_64_rex_emit((inst), 0, 0, 0, (basereg)); \
+               switch((size)) \
+               { \
+                       case 2: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdf; \
+                               x86_64_membase_emit((inst), 1, (basereg), 
(disp)); \
+                       } \
+                       break; \
+                       case 4: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdb; \
+                               x86_64_membase_emit((inst), 1, (basereg), 
(disp)); \
+                       } \
+                       break; \
+                       case 8: \
+                       { \
+                               *(inst)++ = (unsigned char)0xdd; \
+                               x86_64_membase_emit((inst), 1, (basereg), 
(disp)); \
+                       } \
+                       break; \
+               } \
+       } while(0)
+
+#define x86_64_fabs(inst) \
+       do { \
+               *(inst)++ = (unsigned char)0xd9; \
+               *(inst)++ = (unsigned char)0xe1; \
+       } while(0)
+
+#define x86_64_fchs(inst)      \
+       do {    \
+               *(inst)++ = (unsigned char)0xd9;        \
+               *(inst)++ = (unsigned char)0xe0;        \
+       } while(0)
+
+/*
  * Store fpu control word after checking for pending unmasked fpu exceptions
  */
 #define x86_64_fnstcw(inst, mem) \
        do { \
                *(inst)++ = (unsigned char)0xd9; \
                x86_64_mem_emit((inst), 7, (mem)); \
-       } while (0)
+       } while(0)
 
 #define x86_64_fnstcw_membase(inst, basereg, disp) \
        do { \

Index: jit/jit-rules-x86-64.c
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/jit/jit-rules-x86-64.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- jit/jit-rules-x86-64.c      24 Mar 2008 12:42:51 -0000      1.2
+++ jit/jit-rules-x86-64.c      13 Apr 2008 17:55:36 -0000      1.3
@@ -104,6 +104,22 @@
 #define HAVE_RED_ZONE 1
 
 /*
+ * Some declarations that should be replaced by querying the cpuinfo
+ * if generating code for the current cpu.
+ */
+/*
+#define HAVE_X86_SSE_4_1 0
+#define HAVE_X86_SSE_4 0
+#define HAVE_X86_SSE_3 0
+#define HAVE_X86_FISTTP 0
+*/
+
+#define        TODO() \
+do { \
+       fprintf(stderr, "TODO at %s, %d\n", __FILE__, (int)__LINE__); \
+} while(0)
+
+/*
  * Setup or teardown the x86 code output process.
  */
 #define        jit_cache_setup_output(needed)  \
@@ -165,6 +181,9 @@
 static _jit_regclass_t *x86_64_reg;            /* X86_64 general purpose 
registers */
 static _jit_regclass_t *x86_64_creg;   /* X86_64 call clobbered general */
                                                                                
/* purpose registers */
+static _jit_regclass_t *x86_64_dreg;   /* general purpose registers that */
+                                                                               
/* can be used as divisor */
+                                                                               
/* (all but %rax and %rdx) */
 static _jit_regclass_t *x86_64_rreg;   /* general purpose registers not used*/
                                                                                
/* for returning values */
 static _jit_regclass_t *x86_64_sreg;   /* general purpose registers that can*/
@@ -196,6 +215,16 @@
                X86_64_REG_R9, X86_64_REG_R10,
                X86_64_REG_R11);
 
+       /* register class for divisors */
+       x86_64_dreg = _jit_regclass_create(
+               "dreg", JIT_REG_WORD | JIT_REG_LONG, 12,
+               X86_64_REG_RCX, X86_64_REG_RBX,
+               X86_64_REG_RSI, X86_64_REG_RDI,
+               X86_64_REG_R8, X86_64_REG_R9,
+               X86_64_REG_R10, X86_64_REG_R11,
+               X86_64_REG_R12, X86_64_REG_R13,
+               X86_64_REG_R14, X86_64_REG_R15);
+
        /* register class with all registers not used for returning values */
        x86_64_rreg = _jit_regclass_create(
                "rreg", JIT_REG_WORD | JIT_REG_LONG, 12,
@@ -341,6 +370,452 @@
 }
 
 /*
+ * Do a logical xmm operation with packed float32 values
+ */
+static int
+_jit_plops_reg_imm(jit_gencode_t gen, unsigned char **inst_ptr,
+                                  X86_64_XMM_PLOP opc, int reg, void 
*packed_value)
+{
+       void *ptr;
+       jit_nint offset;
+       unsigned char *inst;
+
+       inst = *inst_ptr;
+       ptr = _jit_cache_alloc(&(gen->posn), 16);
+       if(!ptr)
+       {
+               return 0;
+       }
+       jit_memcpy(ptr, packed_value, 16);
+
+       /* calculate the offset for membase addressing */
+       offset = (jit_nint)ptr - ((jit_nint)inst + (reg > 7 ? 8 : 7));
+       if((offset >= jit_min_int) && (offset <= jit_max_int))
+       {
+               /* We can use RIP relative addressing here */
+               x86_64_plops_reg_membase(inst, opc, reg, X86_64_RIP, offset);
+               *inst_ptr = inst;
+               return 1;
+       }
+       /* Check if mem addressing can be used */
+       if(((jit_nint)ptr >= jit_min_int) &&
+               ((jit_nint)ptr <= jit_max_int))
+       {
+               /* We can use absolute addressing */
+               x86_64_plops_reg_mem(inst, opc, reg, (jit_nint)ptr);
+               *inst_ptr = inst;
+               return 1;
+       }
+       /* We have to use an extra general register */
+       TODO();
+       return 0;
+}
+
+/*
+ * Do a logical xmm operation with packed float64 values
+ */
+static int
+_jit_plopd_reg_imm(jit_gencode_t gen, unsigned char **inst_ptr,
+                                  X86_64_XMM_PLOP opc, int reg, void 
*packed_value)
+{
+       void *ptr;
+       jit_nint offset;
+       unsigned char *inst;
+
+       inst = *inst_ptr;
+       ptr = _jit_cache_alloc(&(gen->posn), 16);
+       if(!ptr)
+       {
+               return 0;
+       }
+       jit_memcpy(ptr, packed_value, 16);
+
+       /* calculate the offset for membase addressing */
+       offset = (jit_nint)ptr - ((jit_nint)inst + (reg > 7 ? 9 : 8));
+       if((offset >= jit_min_int) && (offset <= jit_max_int))
+       {
+               /* We can use RIP relative addressing here */
+               x86_64_plopd_reg_membase(inst, opc, reg, X86_64_RIP, offset);
+               *inst_ptr = inst;
+               return 1;
+       }
+       /* Check if mem addressing can be used */
+       if(((jit_nint)ptr >= jit_min_int) &&
+               ((jit_nint)ptr <= jit_max_int))
+       {
+               /* We can use absolute addressing */
+               x86_64_plopd_reg_mem(inst, opc, reg, (jit_nint)ptr);
+               *inst_ptr = inst;
+               return 1;
+       }
+       /* We have to use an extra general register */
+       TODO();
+       return 0;
+}
+
+/*
+ * Helpers for saving and setting roundmode in the fpu control word
+ * and restoring it afterwards.
+ * The rounding mode bits are bit 10 and 11 in the fpu control word.
+ * sp_offset is the start offset of a temporary eight byte block.
+ */
+static unsigned char *
+_x86_64_set_fpu_roundmode(unsigned char *inst, int scratch_reg,
+                                                 int sp_offset, 
X86_64_ROUNDMODE mode)
+{
+       int fpcw_save_offset = sp_offset + 4;
+       int fpcw_new_offset = sp_offset;
+       int round_mode = ((int)mode) << 10;
+       int round_mode_mask = ~(((int)X86_ROUND_ZERO) << 10);
+
+       /* store FPU control word */
+       x86_64_fnstcw_membase(inst, X86_64_RSP, fpcw_save_offset);
+       /* load the value into the scratch register */
+       x86_64_mov_reg_membase_size(inst, scratch_reg, X86_64_RSP, 
fpcw_save_offset, 2);
+       /* Set the rounding mode */
+       if(mode != X86_ROUND_ZERO)
+       {
+               /* Not all bits are set in the mask so we have to clear it 
first */
+               x86_64_and_reg_imm_size(inst, scratch_reg, round_mode_mask, 2);
+       }
+       x86_64_or_reg_imm_size(inst, scratch_reg, round_mode, 2);
+       /* Store the new round mode */
+       x86_64_mov_membase_reg_size(inst, X86_64_RSP, fpcw_new_offset, 
scratch_reg, 2);
+       /* Now load the new control word */
+       x86_64_fldcw_membase(inst, X86_64_RSP, fpcw_new_offset);
+
+       return inst;
+}
+
+static unsigned char *
+_x86_64_restore_fpcw(unsigned char *inst, int sp_offset)
+{
+       int fpcw_save_offset = sp_offset + 4;
+
+       /* Now load the saved control word */
+       x86_64_fldcw_membase(inst, X86_64_RSP, fpcw_save_offset);
+
+       return inst;
+}
+
+/*
+ * Helpers for saving and setting roundmode in the mxcsr register and
+ * restoring it afterwards.
+ * The rounding mode bits are bit 13 and 14 in the mxcsr register.
+ * sp_offset is the start offset of a temporary eight byte block.
+ */
+static unsigned char *
+_x86_64_set_xmm_roundmode(unsigned char *inst, int scratch_reg,
+                                                 int sp_offset, 
X86_64_ROUNDMODE mode)
+{
+       int mxcsr_save_offset = sp_offset + 4;
+       int mxcsr_new_offset = sp_offset;
+       int round_mode = ((int)mode) << 13;
+       int round_mode_mask = ~(((int)X86_ROUND_ZERO) << 13);
+
+       /* save the mxcsr register */
+       x86_64_stmxcsr_membase(inst, X86_64_RSP, mxcsr_save_offset);
+       /* Load the contents of the mxcsr register into the scratch register */
+       x86_64_mov_reg_membase_size(inst, scratch_reg, X86_64_RSP, 
mxcsr_save_offset, 4);
+       /* Set the rounding mode */
+       if(mode != X86_ROUND_ZERO)
+       {
+               /* Not all bits are set in the mask so we have to clear it 
first */
+               x86_64_and_reg_imm_size(inst, scratch_reg, round_mode_mask, 4);
+       }
+       x86_64_or_reg_imm_size(inst, scratch_reg, round_mode, 4);
+       /* Store the new round mode */
+       x86_64_mov_membase_reg_size(inst, X86_64_RSP, mxcsr_new_offset, 
scratch_reg, 4);
+       /* and load it to the mxcsr register */
+       x86_64_ldmxcsr_membase(inst, X86_64_RSP, mxcsr_new_offset);
+
+       return inst;
+}
+
+static unsigned char *
+_x86_64_restore_mxcsr(unsigned char *inst, int sp_offset)
+{
+       int mxcsr_save_offset = sp_offset + 4;
+
+       /* restore the mxcsr register */
+       x86_64_ldmxcsr_membase(inst, X86_64_RSP, mxcsr_save_offset);
+
+       return inst;
+}
+
+/*
+ * perform rounding of scalar single precision values.
+ * We have to use the fpu where sse4.1 is not supported.
+ */
+static unsigned char *
+x86_64_rounds_reg_reg(unsigned char *inst, int dreg, int sreg,
+                                         int scratch_reg, X86_64_ROUNDMODE 
mode)
+{
+#ifdef HAVE_RED_ZONE
+#ifdef HAVE_X86_SSE_4_1
+       x86_64_roundss_reg_reg(inst, dreg, sreg, mode);
+#else
+       /* Copy the xmm register to the stack */
+       x86_64_movss_membase_reg(inst, X86_64_RSP, -16, sreg);
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, mode);
+       /* Load the value to the fpu */
+       x86_64_fld_membase_size(inst, X86_64_RSP, -16, 4);
+       /* And round it to integer */
+       x86_64_frndint(inst);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, -8);
+       /* and move st(0) to the destination register */
+       x86_64_fstp_membase_size(inst, X86_64_RSP, -16, 4);
+       x86_64_movss_reg_membase(inst, dreg, X86_64_RSP, -16);
+#endif
+#else
+#ifdef HAVE_X86_SSE_4_1
+       x86_64_roundss_reg_reg(inst, dreg, sreg, mode);
+#else
+       /* allocate space on the stack for two ints and one long value */
+       x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
+       /* Copy the xmm register to the stack */
+       x86_64_movss_regp_reg(inst, X86_64_RSP, sreg);
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 8, mode);
+       /* Load the value to the fpu */
+       x86_64_fld_regp_size(inst, X86_64_RSP, 4);
+       /* And round it to integer */
+       x86_64_frndint(inst);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, 8);
+       /* and move st(0) to the destination register */
+       x86_64_fstp_regp_size(inst, X86_64_RSP, 4);
+       x86_64_movss_reg_regp(inst, dreg, X86_64_RSP);
+       /* restore the stack pointer */
+       x86_64_add_reg_imm_size(inst, X86_64_RSP, 16, 8);
+#endif
+#endif
+       return inst;
+}
+
+static unsigned char *
+x86_64_rounds_reg_membase(unsigned char *inst, int dreg, int offset,
+                                                 int scratch_reg, 
X86_64_ROUNDMODE mode)
+{
+#ifdef HAVE_RED_ZONE
+#ifdef HAVE_X86_SSE_4_1
+       x86_64_roundss_reg_membase(inst, dreg, X86_64_RBP, offset, mode);
+#else
+       /* Load the value to the fpu */
+       x86_64_fld_membase_size(inst, X86_64_RBP, offset, 4);
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, mode);
+       /* And round it to integer */
+       x86_64_frndint(inst);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, -8);
+       /* and move st(0) to the destination register */
+       x86_64_fstp_membase_size(inst, X86_64_RSP, -16, 4);
+       x86_64_movss_reg_membase(inst, dreg, X86_64_RSP, -16);
+#endif
+#else
+#ifdef HAVE_X86_SSE_4_1
+       x86_64_roundss_reg_membase(inst, dreg, X86_64_RBP, offset, mode);
+#else
+       /* allocate space on the stack for two ints and one long value */
+       x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
+       /* Load the value to the fpu */
+       x86_64_fld_membase_size(inst, X86_64_RBP, offset, 4);
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 8, mode);
+       /* And round it to integer */
+       x86_64_frndint(inst);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, 8);
+       /* and move st(0) to the destination register */
+       x86_64_fstp_regp_size(inst, X86_64_RSP, 4);
+       x86_64_movss_reg_regp(inst, dreg, X86_64_RSP);
+       /* restore the stack pointer */
+       x86_64_add_reg_imm_size(inst, X86_64_RSP, 16, 8);
+#endif
+#endif
+       return inst;
+}
+
+/*
+ * perform rounding of scalar double precision values.
+ * We have to use the fpu where sse4.1 is not supported.
+ */
+static unsigned char *
+x86_64_roundd_reg_reg(unsigned char *inst, int dreg, int sreg,
+                                         int scratch_reg, X86_64_ROUNDMODE 
mode)
+{
+#ifdef HAVE_RED_ZONE
+#ifdef HAVE_X86_SSE_4_1
+       x86_64_roundsd_reg_reg(inst, dreg, sreg, mode);
+#else
+       /* Copy the xmm register to the stack */
+       x86_64_movsd_membase_reg(inst, X86_64_RSP, -16, sreg);
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, mode);
+       /* Load the value to the fpu */
+       x86_64_fld_membase_size(inst, X86_64_RSP, -16, 8);
+       /* And round it to integer */
+       x86_64_frndint(inst);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, -8);
+       /* and move st(0) to the destination register */
+       x86_64_fstp_membase_size(inst, X86_64_RSP, -16, 8);
+       x86_64_movsd_reg_membase(inst, dreg, X86_64_RSP, -16);
+#endif
+#else
+#ifdef HAVE_X86_SSE_4_1
+       x86_64_roundsd_reg_reg(inst, dreg, sreg, mode);
+#else
+       /* allocate space on the stack for two ints and one long value */
+       x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
+       /* Copy the xmm register to the stack */
+       x86_64_movsd_regp_reg(inst, X86_64_RSP, sreg);
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 8, mode);
+       /* Load the value to the fpu */
+       x86_64_fld_regp_size(inst, X86_64_RSP, 8);
+       /* And round it to integer */
+       x86_64_frndint(inst);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, 8);
+       /* and move st(0) to the destination register */
+       x86_64_fstp_regp_size(inst, X86_64_RSP, 8);
+       x86_64_movsd_reg_regp(inst, dreg, X86_64_RSP);
+       /* restore the stack pointer */
+       x86_64_add_reg_imm_size(inst, X86_64_RSP, 16, 8);
+#endif
+#endif
+       return inst;
+}
+
+static unsigned char *
+x86_64_roundd_reg_membase(unsigned char *inst, int dreg, int offset,
+                                                 int scratch_reg, 
X86_64_ROUNDMODE mode)
+{
+#ifdef HAVE_RED_ZONE
+#ifdef HAVE_X86_SSE_4_1
+       x86_64_roundsd_reg_membase(inst, dreg, X86_64_RBP, offset, mode);
+#else
+       /* Load the value to the fpu */
+       x86_64_fld_membase_size(inst, X86_64_RBP, offset, 8);
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, mode);
+       /* And round it to integer */
+       x86_64_frndint(inst);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, -8);
+       /* and move st(0) to the destination register */
+       x86_64_fstp_membase_size(inst, X86_64_RSP, -16, 8);
+       x86_64_movsd_reg_membase(inst, dreg, X86_64_RSP, -16);
+#endif
+#else
+#ifdef HAVE_X86_SSE_4_1
+       x86_64_roundsd_reg_membase(inst, dreg, X86_64_RBP, offset, mode);
+#else
+       /* allocate space on the stack for two ints and one long value */
+       x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
+       /* Load the value to the fpu */
+       x86_64_fld_membase_size(inst, X86_64_RBP, offset, 8);
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 8, mode);
+       /* And round it to integer */
+       x86_64_frndint(inst);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, 8);
+       /* and move st(0) to the destination register */
+       x86_64_fstp_regp_size(inst, X86_64_RSP, 8);
+       x86_64_movsd_reg_regp(inst, dreg, X86_64_RSP);
+       /* restore the stack pointer */
+       x86_64_add_reg_imm_size(inst, X86_64_RSP, 16, 8);
+#endif
+#endif
+       return inst;
+}
+
+/*
+ * Round the value in St(0) to integer according to the rounding
+ * mode specified.
+ */
+static unsigned char *
+x86_64_roundnf(unsigned char *inst, int scratch_reg, X86_64_ROUNDMODE mode)
+{
+#ifdef HAVE_RED_ZONE
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, mode);
+       /* And round it to integer */
+       x86_64_frndint(inst);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, -8);
+#else
+       /* allocate space on the stack for two ints and one long value */
+       x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 0, mode);
+       /* And round it to integer */
+       x86_64_frndint(inst);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, 0);
+       /* restore the stack pointer */
+       x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#endif
+       return inst;
+}
+
+/*
+ * Round the value in the fpu register st(0) to integer and
+ * store the value in dreg. St(0) is popped from the fpu stack.
+ */
+static unsigned char *
+x86_64_nfloat_to_int(unsigned char *inst, int dreg, int scratch_reg, int size)
+{
+#ifdef HAVE_RED_ZONE
+#ifdef HAVE_X86_FISTTP
+       /* convert float to int */
+       x86_64_fisttp_membase_size(inst, X86_64_RSP, -8, 4);
+       /* move result to the destination */
+       x86_64_mov_reg_membase_size(inst, dreg, X86_64_RSP, -8, 4);
+#else
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, -8, X86_ROUND_ZERO);
+       /* And round the value in st(0) to integer and store it on the stack */
+       x86_64_fistp_membase_size(inst, X86_64_RSP, -16, size);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, -8);
+       /* and load the integer to the destination register */
+       x86_64_mov_reg_membase_size(inst, dreg, X86_64_RSP, -16, size);
+#endif
+#else
+#ifdef HAVE_X86_FISTTP
+       /* allocate space on the stack for one long value */
+       x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+       /* convert float to int */
+       x86_64_fisttp_regp_size(inst, X86_64_RSP, 4);
+       /* move result to the destination */
+       x86_64_mov_reg_regp_size(inst, dreg, X86_64_RSP, 4);
+       /* restore the stack pointer */
+       x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#else
+       /* allocate space on the stack for 2 ints and one long value */
+       x86_64_sub_reg_imm_size(inst, X86_64_RSP, 16, 8);
+       /* Set the fpu round mode */
+       inst = _x86_64_set_fpu_roundmode(inst, scratch_reg, 8, X86_ROUND_ZERO);
+       /* And round the value in st(0) to integer and store it on the stack */
+       x86_64_fistp_regp_size(inst, X86_64_RSP, size);
+       /* restore the fpu control word */
+       inst = _x86_64_restore_fpcw(inst, 8);
+       /* and load the integer to the destination register */
+       x86_64_mov_reg_regp_size(inst, dreg, X86_64_RSP, size);
+       /* restore the stack pointer */
+       x86_64_add_reg_imm_size(inst, X86_64_RSP, 16, 8);
+#endif
+#endif
+       return inst;
+}
+
+/*
  * Call a function
  */
 static unsigned char *
@@ -1049,9 +1524,16 @@
                                {
                                        int xmm_reg = _jit_reg_info[reg].cpu_reg;
 
+                                       if(float32_value == (jit_float32) 0.0)
+                                       {
+                                               x86_64_clear_xreg(inst, xmm_reg);
+                                       }
+                                       else
+                                       {
                                        _jit_xmm1_reg_imm_size_float32(gen, &inst, XMM1_MOV,
                                                                       xmm_reg, &float32_value);
                                }
+                               }
                                else
                                {
                                        if(float32_value == (jit_float32) 0.0)
@@ -1069,7 +1551,7 @@
                                                ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_float32));
                                                jit_memcpy(ptr, &float32_value, sizeof(float32_value));
 
-                                               offset = (jit_nint)ptr - ((jit_nint)inst + 7);
+                                               offset = (jit_nint)ptr - ((jit_nint)inst + 6);
                                                if((offset >= jit_min_int) && (offset <= jit_max_int))
                                                {
                                                        /* We can use RIP relative addressing here */
@@ -1084,7 +1566,7 @@
                                                else
                                                {
-                                                       /* We have to use an extra general register */
-                                                       /* TODO */
+                                                       TODO();
                                                }
                                        }
                                }
@@ -1111,9 +1593,16 @@
                                {
                                        int xmm_reg = _jit_reg_info[reg].cpu_reg;
 
+                                       if(float64_value == (jit_float64) 0.0)
+                                       {
+                                               x86_64_clear_xreg(inst, xmm_reg);
+                                       }
+                                       else
+                                       {
                                        _jit_xmm1_reg_imm_size_float64(gen, &inst, XMM1_MOV,
                                                                       xmm_reg, &float64_value);
                                }
+                               }
                                else
                                {
                                        if(float64_value == (jit_float64) 0.0)
@@ -1131,7 +1620,7 @@
                                                ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_float64));
                                                jit_memcpy(ptr, &float64_value, sizeof(float64_value));
 
-                                               offset = (jit_nint)ptr - ((jit_nint)inst + 7);
+                                               offset = (jit_nint)ptr - ((jit_nint)inst + 6);
                                                if((offset >= jit_min_int) && (offset <= jit_max_int))
                                                {
                                                        /* We can use RIP relative addressing here */
@@ -1146,7 +1635,7 @@
                                                else
                                                {
-                                                       /* We have to use an extra general register */
-                                                       /* TODO */
+                                                       TODO();
                                                }
                                        }
                                }
@@ -1192,7 +1681,7 @@
                                        else
                                        {
                                                /* We have to use an extra general register */
-                                               /* TODO */
+                                               TODO();
                                        }
                                }
                                else
@@ -1212,7 +1701,7 @@
                                                ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_nfloat));
                                                jit_memcpy(ptr, &nfloat_value, sizeof(nfloat_value));
 
-                                               offset = (jit_nint)ptr - ((jit_nint)inst + 7);
+                                               offset = (jit_nint)ptr - ((jit_nint)inst + 6);
                                                if((offset >= jit_min_int) && (offset <= jit_max_int))
                                                {
                                                        /* We can use RIP relative addressing here */
@@ -1241,7 +1730,7 @@
                                                else
                                                {
-                                                       /* We have to use an extra general register */
-                                                       /* TODO */
+                                                       TODO();
                                                }
                                        }
                                }
@@ -2315,11 +2804,6 @@
        return inst;
 }
 
-#define        TODO()          \
-       do { \
-               fprintf(stderr, "TODO at %s, %d\n", __FILE__, (int)__LINE__); \
-       } while (0)
-
 void
 _jit_gen_insn(jit_gencode_t gen, jit_function_t func,
                          jit_block_t block, jit_insn_t insn)

Index: jit/jit-rules-x86-64.ins
===================================================================
RCS file: /cvsroot/dotgnu-pnet/libjit/jit/jit-rules-x86-64.ins,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -b -r1.3 -r1.4
--- jit/jit-rules-x86-64.ins    30 Mar 2008 15:05:14 -0000      1.3
+++ jit/jit-rules-x86-64.ins    13 Apr 2008 17:55:36 -0000      1.4
@@ -22,6 +22,7 @@
  
 %regclass reg x86_64_reg
 %regclass creg x86_64_creg
+%regclass dreg x86_64_dreg
 %regclass rreg x86_64_rreg
 %regclass sreg x86_64_sreg
 %regclass freg x86_64_freg
@@ -91,62 +92,106 @@
                x86_64_mov_reg_reg_size(inst, $1, $2, 4);
        }
 
+JIT_OP_INT_TO_NFLOAT:
+       [=freg, local] -> {
+               x86_64_fild_membase_size(inst, X86_64_RBP, $2, 4);
+       }
+       [=freg, reg] -> {
+#ifdef HAVE_RED_ZONE
+               x86_64_mov_membase_reg_size(inst, X86_64_RSP, -8, $2, 4);
+               x86_64_fild_membase_size(inst, X86_64_RSP, -8, 4);
+#else
+               x86_64_push_reg_size(inst, $2, 8);
+               x86_64_fild_membase_size(inst, X86_64_RSP, 0, 4);
+               x86_64_add_reg_imm_size(inst, X86_64_RSP, sizeof(jit_nint), 8);
+#endif
+       }
+
+JIT_OP_LONG_TO_NFLOAT:
+       [=freg, local] -> {
+               x86_64_fild_membase_size(inst, X86_64_RBP, $2, 8);
+       }
+       [=freg, reg] -> {
+#ifdef HAVE_RED_ZONE
+               x86_64_mov_membase_reg_size(inst, X86_64_RSP, -8, $2, 8);
+               x86_64_fild_membase_size(inst, X86_64_RSP, -8, 8);
+#else
+               x86_64_push_reg_size(inst, $2, 8);
+               x86_64_fild_membase_size(inst, X86_64_RSP, 0, 8);
+               x86_64_add_reg_imm_size(inst, X86_64_RSP, sizeof(jit_nint), 8);
+#endif
+       }
+
 JIT_OP_NFLOAT_TO_INT: stack
-       [=reg, freg] -> {
-               /* allocate space on the stack for 2 shorts and 1 int */
+       [=reg, freg, scratch reg] -> {
+               inst = x86_64_nfloat_to_int(inst, $1, $3, 4);
+       }
+
+JIT_OP_NFLOAT_TO_LONG: stack
+       [=reg, freg, scratch reg] -> {
+               inst = x86_64_nfloat_to_int(inst, $1, $3, 8);
+       }
+
+JIT_OP_FLOAT32_TO_NFLOAT:
+       [=freg, local] -> {
+               x86_64_fld_membase_size(inst, X86_64_RBP, $2, 4);
+       }
+       [=freg, xreg] -> {
+#ifdef HAVE_RED_ZONE
+               x86_64_movss_membase_reg(inst, X86_64_RSP, -8, $2);
+               x86_64_fld_membase_size(inst, X86_64_RSP, -8, 4);
+#else
                x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
-               /* store FPU control word */
-               x86_64_fnstcw_membase(inst, X86_64_RSP, 0);
-               /* set "round toward zero" mode */
-               x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 0, 2);
-               x86_64_or_reg_imm_size(inst, $1, 0xc00, 2);
-               x86_64_mov_membase_reg_size(inst, X86_64_RSP, 2, $1, 2);
-               x86_64_fldcw_membase(inst, X86_64_RSP, 2);
-               /* convert float to int */
-               x86_64_fistp_membase_size(inst, X86_64_RSP, 4, 4);
-               /* restore FPU control word */
-               x86_64_fldcw_membase(inst, X86_64_RSP, 0);
-               /* move result to the destination */
-               x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 4, 4);
-               /* restore the stack */
+               x86_64_movss_regp_reg(inst, X86_64_RSP, $2);
+               x86_64_fld_regp_size(inst, X86_64_RSP, 4);
                x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#endif
        }
 
-JIT_OP_NFLOAT_TO_LONG: stack
-       [=reg, freg] -> {
-               /* allocate space on the stack for 2 shorts and 1 long */
-               x86_64_sub_reg_imm_size(inst, X86_64_RSP, 12, 8);
-               /* store FPU control word */
-               x86_64_fnstcw_membase(inst, X86_64_RSP, 0);
-               /* set "round toward zero" mode */
-               x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 0, 2);
-               x86_64_or_reg_imm_size(inst, $1, 0xc00, 2);
-               x86_64_mov_membase_reg_size(inst, X86_64_RSP, 2, $1, 2);
-               x86_64_fldcw_membase(inst, X86_64_RSP, 2);
-               /* convert float to long */
-               x86_64_fistp_membase_size(inst, X86_64_RSP, 4, 8);
-               /* restore FPU control word */
-               x86_64_fldcw_membase(inst, X86_64_RSP, 0);
-               /* move result to the destination */
-               x86_64_mov_reg_membase_size(inst, $1, X86_64_RSP, 4, 8);
-               /* restore the stack */
-               x86_64_add_reg_imm_size(inst, X86_64_RSP, 12, 8);
+JIT_OP_FLOAT64_TO_NFLOAT:
+       [=freg, local] -> {
+               x86_64_fld_membase_size(inst, X86_64_RBP, $2, 8);
+       }
+       [=freg, xreg] -> {
+#ifdef HAVE_RED_ZONE
+               x86_64_movsd_membase_reg(inst, X86_64_RSP, -8, $2);
+               x86_64_fld_membase_size(inst, X86_64_RSP, -8, 8);
+#else
+               x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+               x86_64_movsd_regp_reg(inst, X86_64_RSP, $2);
+               x86_64_fld_regp_size(inst, X86_64_RSP, 8);
+               x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#endif
        }
 
 JIT_OP_NFLOAT_TO_FLOAT32: stack
        [=xreg, freg] -> {
+#ifdef HAVE_RED_ZONE
                /* Avoid modifying the stack pointer by simply using negative */
                /* offsets here. */
                x86_64_fstp_membase_size(inst, X86_64_RSP, -8, 4);
                x86_64_movss_reg_membase(inst, $1, X86_64_RSP, -8);
+#else
+               x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+               x86_64_fstp_regp_size(inst, X86_64_RSP, 4);
+               x86_64_movss_reg_regp(inst, $1, X86_64_RSP);
+               x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#endif
        }
 
 JIT_OP_NFLOAT_TO_FLOAT64: stack
        [=xreg, freg] -> {
+#ifdef HAVE_RED_ZONE
                /* Avoid modifying the stack pointer by simply using negative */
                /* offsets here. */
                x86_64_fstp_membase_size(inst, X86_64_RSP, -8, 8);
                x86_64_movsd_reg_membase(inst, $1, X86_64_RSP, -8);
+#else
+               x86_64_sub_reg_imm_size(inst, X86_64_RSP, 8, 8);
+               x86_64_fstp_regp_size(inst, X86_64_RSP, 8);
+               x86_64_movsd_reg_regp(inst, $1, X86_64_RSP);
+               x86_64_add_reg_imm_size(inst, X86_64_RSP, 8, 8);
+#endif
        }
 
 /*
@@ -894,12 +939,12 @@
                x86_64_cmov_reg_reg_size(inst, X86_CC_S, $1, $3, 1, 4);
                x86_64_sar_reg_imm_size(inst, $1, shift, 4);
        }
-       [reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+       [reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
                x86_64_mov_reg_imm_size(inst, $3, $2, 4);
                x86_64_cdq(inst);
                x86_64_idiv_reg_size(inst, $3, 4);
        }
-       [reg("rax"), reg, scratch reg("rdx")] -> {
+       [reg("rax"), dreg, scratch reg("rdx")] -> {
                jit_int min_int = jit_min_int;
                unsigned char *patch, *patch2;
 #ifndef JIT_USE_SIGNALS
@@ -937,12 +982,12 @@
                }
                x86_64_shr_reg_imm_size(inst, $1, shift, 4);
        }
-       [reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+       [reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
                x86_64_mov_reg_imm_size(inst, $3, $2, 4);
                x86_64_clear_reg(inst, X86_64_RDX);
                x86_64_div_reg_size(inst, $3, 4);
        }
-       [reg("rax"), reg, scratch reg("rdx")] -> {
+       [reg("rax"), dreg, scratch reg("rdx")] -> {
 #ifndef JIT_USE_SIGNALS
                unsigned char *patch;
                x86_64_test_reg_reg_size(inst, $2, $2, 4);
@@ -974,12 +1019,12 @@
                x86_patch(patch, inst);
                x86_64_clear_reg(inst, $1);
        }
-       [=reg("rdx"), *reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+       [=reg("rdx"), *reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
                x86_64_mov_reg_imm_size(inst, $4, $3, 4);
                x86_64_cdq(inst);
                x86_64_idiv_reg_size(inst, $4, 4);
        }
-       [=reg("rdx"), *reg("rax"), reg, scratch reg("rdx")] -> {
+       [=reg("rdx"), *reg("rax"), dreg, scratch reg("rdx")] -> {
                jit_int min_int = jit_min_int;
                unsigned char *patch, *patch2;
 #ifndef JIT_USE_SIGNALS
@@ -1009,16 +1054,16 @@
        [reg, imm, if("$2 == 1")] -> {
                x86_64_clear_reg(inst, $1);
        }
-       [reg, imm, if("(((jit_nuint)$2) & (((jit_nuint)$2) - 1)) == 0")] -> {
+       [reg, imm, if("($2 & ($2 - 1)) == 0")] -> {
                /* x & (x - 1) is equal to zero if x is a power of 2  */
                x86_64_and_reg_imm_size(inst, $1, $2 - 1, 4);
        }
-       [=reg("rdx"), *reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+       [=reg("rdx"), *reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
                x86_64_mov_reg_imm_size(inst, $4, $3, 4);
                x86_64_clear_reg(inst, X86_64_RDX);
                x86_64_div_reg_size(inst, $4, 4);
        }
-       [=reg("rdx"), *reg("rax"), reg, scratch reg("rdx")] -> {
+       [=reg("rdx"), *reg("rax"), dreg, scratch reg("rdx")] -> {
 #ifndef JIT_USE_SIGNALS
                unsigned char *patch;
                x86_64_test_reg_reg_size(inst, $3, $3, 4);
@@ -1170,12 +1215,12 @@
                x86_64_cmov_reg_reg_size(inst, X86_CC_S, $1, $3, 1, 8);
                x86_64_sar_reg_imm_size(inst, $1, shift, 8);
        }
-       [reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+       [reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
                x86_64_mov_reg_imm_size(inst, $3, $2, 8);
                x86_64_cqo(inst);
                x86_64_idiv_reg_size(inst, $3, 8);
        }
-       [reg("rax"), reg, scratch reg("rdx")] -> {
+       [reg("rax"), dreg, scratch reg("rdx")] -> {
                jit_long min_long = jit_min_long;
                unsigned char *patch, *patch2;
 #ifndef JIT_USE_SIGNALS
@@ -1214,12 +1259,12 @@
                }
                x86_64_shr_reg_imm_size(inst, $1, shift, 8);
        }
-       [reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+       [reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
                x86_64_mov_reg_imm_size(inst, $3, $2, 8);
                x86_64_clear_reg(inst, X86_64_RDX);
                x86_64_div_reg_size(inst, $3, 8);
        }
-       [reg("rax"), reg, scratch reg("rdx")] -> {
+       [reg("rax"), dreg, scratch reg("rdx")] -> {
 #ifndef JIT_USE_SIGNALS
                unsigned char *patch;
                x86_64_test_reg_reg_size(inst, $2, $2, 8);
@@ -1251,12 +1296,12 @@
                x86_patch(patch, inst);
                x86_64_clear_reg(inst, $1);
        }
-       [=reg("rdx"), *reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+       [=reg("rdx"), *reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
                x86_64_mov_reg_imm_size(inst, $4, $3, 8);
                x86_64_cqo(inst);
                x86_64_idiv_reg_size(inst, $4, 8);
        }
-       [=reg("rdx"), *reg("rax"), reg, scratch reg("rdx")] -> {
+       [=reg("rdx"), *reg("rax"), dreg, scratch reg("rdx")] -> {
                jit_long min_long = jit_min_long;
                unsigned char *patch, *patch2;
 #ifndef JIT_USE_SIGNALS
@@ -1301,12 +1346,12 @@
                        x86_64_and_reg_reg_size(inst, $1, $3, 8);
                }
        }
-       [=reg("rdx"), *reg("rax"), imm, scratch reg, scratch reg("rdx")] -> {
+       [=reg("rdx"), *reg("rax"), imm, scratch dreg, scratch reg("rdx")] -> {
                x86_64_mov_reg_imm_size(inst, $4, $3, 8);
                x86_64_clear_reg(inst, X86_64_RDX);
                x86_64_div_reg_size(inst, $4, 8);
        }
-       [=reg("rdx"), *reg("rax"), reg, scratch reg("rdx")] -> {
+       [=reg("rdx"), *reg("rax"), dreg, scratch reg("rdx")] -> {
 #ifndef JIT_USE_SIGNALS
                unsigned char *patch;
                x86_64_test_reg_reg_size(inst, $3, $3, 8);
@@ -1367,6 +1412,22 @@
                x86_64_divss_reg_membase(inst, $1, X86_64_RBP, $2);
        }
 
+JIT_OP_FABS:
+       [xreg] -> {
+               /* Simply clear the sign */
+               jit_uint values[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+
+               _jit_plops_reg_imm(gen, &inst, XMM_ANDP, $1, &(values[0]));
+       }
+
+JIT_OP_FNEG:
+       [xreg] -> {
+               /* Simply toggle the sign */
+               jit_uint values[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+
+               _jit_plops_reg_imm(gen, &inst, XMM_XORP, $1, &(values[0]));
+       }
+
 /*
  * double precision float versions
  */
@@ -1415,6 +1476,35 @@
                x86_64_divsd_reg_reg(inst, $1, $2);
        }
 
+JIT_OP_DABS:
+       [xreg] -> {
+               /* Simply clear the sign */
+               jit_ulong values[2] = {0x7fffffffffffffff, 0x7fffffffffffffff};
+
+               _jit_plopd_reg_imm(gen, &inst, XMM_ANDP, $1, &(values[0]));
+       }
+
+JIT_OP_DNEG:
+       [xreg] -> {
+               /* Simply toggle the sign */
+               jit_ulong values[2] = {0x8000000000000000, 0x8000000000000000};
+
+               _jit_plopd_reg_imm(gen, &inst, XMM_XORP, $1, &(values[0]));
+       }
+
+/*
+ * native float versions
+ */
+JIT_OP_NFABS: stack
+       [freg] -> {
+               x86_64_fabs(inst);
+       }
+
+JIT_OP_NFNEG:  stack
+       [freg] -> {
+               x86_64_fchs(inst);
+       }
+
 /*
  * Bitwise opcodes.
  */
@@ -1872,13 +1962,133 @@
                inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
        }
 
+JIT_OP_BR_FEQ:
+       [xreg, local] -> {
+               x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+
+JIT_OP_BR_FNE:
+       [xreg, local] -> {
+               x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+
+JIT_OP_BR_FLT:
+       [xreg, local] -> {
+               x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+       }
+
+JIT_OP_BR_FLE:
+       [xreg, local] -> {
+               x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+       }
+
+JIT_OP_BR_FGT:
+       [xreg, local] -> {
+               x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+       }
+
+JIT_OP_BR_FGE:
+       [xreg, local] -> {
+               x86_64_comiss_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+       }
+
+JIT_OP_BR_DEQ:
+       [xreg, local] -> {
+               x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x74 /* eq */, insn);
+       }
+
+JIT_OP_BR_DNE:
+       [xreg, local] -> {
+               x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x75 /* ne */, insn);
+       }
+
+JIT_OP_BR_DLT:
+       [xreg, local] -> {
+               x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x72 /* lt_un */, insn);
+       }
+
+JIT_OP_BR_DLE:
+       [xreg, local] -> {
+               x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x76 /* le_un */, insn);
+       }
+
+JIT_OP_BR_DGT:
+       [xreg, local] -> {
+               x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x77 /* gt_un */, insn);
+       }
+
+JIT_OP_BR_DGE:
+       [xreg, local] -> {
+               x86_64_comisd_reg_membase(inst, $1, X86_64_RBP, $2);
+               inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+       }
+       [xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $1, $2);
+               inst = output_branch(func, inst, 0x73 /* ge_un */, insn);
+       }
+
 /*
  * Comparison opcodes.
  */
 
 JIT_OP_IEQ:
        [=reg, reg, immzero] -> {
-               x86_64_or_reg_reg_size(inst, $2, $2, 4);
+               x86_64_test_reg_reg_size(inst, $2, $2, 4);
                inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
        }
        [=reg, reg, imm] -> {
@@ -1896,7 +2106,7 @@
 
 JIT_OP_INE:
        [=reg, reg, immzero] -> {
-               x86_64_or_reg_reg_size(inst, $2, $2, 4);
+               x86_64_test_reg_reg_size(inst, $2, $2, 4);
                inst = setcc_reg(inst, $1, X86_CC_NE, 0);
        }
        [=reg, reg, imm] -> {
@@ -2026,7 +2236,7 @@
 
 JIT_OP_LEQ:
        [=reg, reg, immzero] -> {
-               x86_64_or_reg_reg_size(inst, $2, $2, 8);
+               x86_64_test_reg_reg_size(inst, $2, $2, 8);
                inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
        }
        [=reg, reg, imm, if("$3 >= (jit_nint)jit_min_int && $3 <= (jit_nint)jit_max_int")] -> {
@@ -2044,7 +2254,7 @@
 
 JIT_OP_LNE:
        [=reg, reg, immzero] -> {
-               x86_64_or_reg_reg_size(inst, $2, $2, 8);
+               x86_64_test_reg_reg_size(inst, $2, $2, 8);
                inst = setcc_reg(inst, $1, X86_CC_NE, 0);
        }
        [=reg, reg, imm, if("$3 >= (jit_nint)jit_min_int && $3 <= (jit_nint)jit_max_int")] -> {
@@ -2172,6 +2382,232 @@
                inst = setcc_reg(inst, $1, X86_CC_GE, 0);
        }
 
+JIT_OP_FEQ:
+       [=reg, xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+       }
+
+JIT_OP_FNE:
+       [=reg, xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+       }
+
+JIT_OP_FLT:
+       [=reg, xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_B, 0);
+       }
+
+JIT_OP_FLE:
+       [=reg, xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_BE, 0);
+       }
+
+JIT_OP_FGT:
+       [=reg, xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_A, 0);
+       }
+
+JIT_OP_FGE:
+       [=reg, xreg, xreg] -> {
+               x86_64_comiss_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_AE, 0);
+       }
+
+JIT_OP_DEQ:
+       [=reg, xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_EQ, 0);
+       }
+
+JIT_OP_DNE:
+       [=reg, xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_NE, 0);
+       }
+
+JIT_OP_DLT:
+       [=reg, xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_B, 0);
+       }
+
+JIT_OP_DLE:
+       [=reg, xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_BE, 0);
+       }
+
+JIT_OP_DGT:
+       [=reg, xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_A, 0);
+       }
+
+JIT_OP_DGE:
+       [=reg, xreg, xreg] -> {
+               x86_64_comisd_reg_reg(inst, $2, $3);
+               inst = setcc_reg(inst, $1, X86_CC_AE, 0);
+       }
+
+JIT_OP_FSQRT:
+       [=xreg, local] -> {
+               x86_64_sqrtss_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [=xreg, xreg] -> {
+               x86_64_sqrtss_reg_reg(inst, $1, $2);
+       }
+
+JIT_OP_DSQRT:
+       [=xreg, local] -> {
+               x86_64_sqrtsd_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [=xreg, xreg] -> {
+               x86_64_sqrtsd_reg_reg(inst, $1, $2);
+       }
+
+/*
+ * Absolute, minimum, maximum, and sign.
+ */
+JIT_OP_IMAX:
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               x86_64_cmov_reg_reg_size(inst, X86_CC_LT, $1, $2, 1, 4);
+       }
+
+JIT_OP_IMAX_UN:
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               x86_64_cmov_reg_reg_size(inst, X86_CC_LT, $1, $2, 0, 4);
+       }
+
+JIT_OP_IMIN:
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               x86_64_cmov_reg_reg_size(inst, X86_CC_GT, $1, $2, 1, 4);
+       }
+
+JIT_OP_IMIN_UN:
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 4);
+               x86_64_cmov_reg_reg_size(inst, X86_CC_GT, $1, $2, 0, 4);
+       }
+
+JIT_OP_LMAX:
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               x86_64_cmov_reg_reg_size(inst, X86_CC_LT, $1, $2, 1, 8);
+       }
+
+JIT_OP_LMAX_UN:
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               x86_64_cmov_reg_reg_size(inst, X86_CC_LT, $1, $2, 0, 8);
+       }
+
+JIT_OP_LMIN:
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               x86_64_cmov_reg_reg_size(inst, X86_CC_GT, $1, $2, 1, 8);
+       }
+
+JIT_OP_LMIN_UN:
+       [reg, reg] -> {
+               x86_64_cmp_reg_reg_size(inst, $1, $2, 8);
+               x86_64_cmov_reg_reg_size(inst, X86_CC_GT, $1, $2, 0, 8);
+       }
+
+JIT_OP_FMAX:
+       [xreg, local] -> {
+               x86_64_maxss_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [xreg, xreg] -> {
+               x86_64_maxss_reg_reg(inst, $1, $2);
+       }
+
+JIT_OP_FMIN:
+       [xreg, local] -> {
+               x86_64_minss_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [xreg, xreg] -> {
+               x86_64_minss_reg_reg(inst, $1, $2);
+       }
+
+JIT_OP_DMAX:
+       [xreg, local] -> {
+               x86_64_maxsd_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [xreg, xreg] -> {
+               x86_64_maxsd_reg_reg(inst, $1, $2);
+       }
+
+JIT_OP_DMIN:
+       [xreg, local] -> {
+               x86_64_minsd_reg_membase(inst, $1, X86_64_RBP, $2);
+       }
+       [xreg, xreg] -> {
+               x86_64_minsd_reg_reg(inst, $1, $2);
+       }
+
+/*
+ * Rounding
+ */
+JIT_OP_FFLOOR: more_space
+       [=xreg, local, scratch reg] -> {
+               inst = x86_64_rounds_reg_membase(inst, $1, $2, $3, X86_ROUND_DOWN);
+       }
+       [=xreg, xreg, scratch reg] -> {
+               inst = x86_64_rounds_reg_reg(inst, $1, $2, $3, X86_ROUND_DOWN);
+       }
+
+JIT_OP_DFLOOR: more_space
+       [=xreg, local, scratch reg] -> {
+               inst = x86_64_roundd_reg_membase(inst, $1, $2, $3, X86_ROUND_DOWN);
+       }
+       [=xreg, xreg, scratch reg] -> {
+               inst = x86_64_roundd_reg_reg(inst, $1, $2, $3, X86_ROUND_DOWN);
+       }
+
+JIT_OP_NFFLOOR: more_space
+       [freg, scratch reg] -> {
+               inst = x86_64_roundnf(inst, $2, X86_ROUND_DOWN);
+       }
+
+JIT_OP_FCEIL: more_space
+       [=xreg, local, scratch reg] -> {
+               inst = x86_64_rounds_reg_membase(inst, $1, $2, $3, X86_ROUND_UP);
+       }
+       [=xreg, xreg, scratch reg] -> {
+               inst = x86_64_rounds_reg_reg(inst, $1, $2, $3, X86_ROUND_UP);
+       }
+
+JIT_OP_DCEIL: more_space
+       [=xreg, local, scratch reg] -> {
+               inst = x86_64_roundd_reg_membase(inst, $1, $2, $3, X86_ROUND_UP);
+       }
+       [=xreg, xreg, scratch reg] -> {
+               inst = x86_64_roundd_reg_reg(inst, $1, $2, $3, X86_ROUND_UP);
+       }
+
+JIT_OP_NFCEIL: more_space
+       [freg, scratch reg] -> {
+               inst = x86_64_roundnf(inst, $2, X86_ROUND_UP);
+       }
+
+/*
+JIT_OP_FRINT: more_space
+       [=xreg, local, scratch reg] -> {
+               inst = x86_64_rounds_reg_membase(inst, $1, $2, $3, X86_ROUND_ZERO);
+       }
+       [=xreg, xreg, scratch reg] -> {
+               inst = x86_64_rounds_reg_reg(inst, $1, $2, $3, X86_ROUND_ZERO);
+       }
+*/
+
 /*
  * Pointer check opcodes.
  */




reply via email to

[Prev in Thread] Current Thread [Next in Thread]