guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] 04/07: RISC-V Support


From: Ludovic Courtès
Subject: [Guile-commits] 04/07: RISC-V Support
Date: Tue, 31 Jan 2023 10:13:07 -0500 (EST)

civodul pushed a commit to branch wip-lightening-riscv
in repository guile.

commit 2806dd865f56ea96067785b2b705ab8e2c92c13f
Author: Ekaitz Zarraga <ekaitz@elenq.tech>
AuthorDate: Sun May 9 16:39:03 2021 +0200

    RISC-V Support
---
 lightening.am           |    6 +-
 lightening.h            |    2 +
 lightening/endian.h     |    2 +
 lightening/lightening.c |   33 +
 lightening/riscv-cpu.c  | 2464 +++++++++++++++++++++++++++++++++++++++++++++++
 lightening/riscv-fpu.c  |  858 +++++++++++++++++
 lightening/riscv.c      |  327 +++++++
 lightening/riscv.h      |  194 ++++
 8 files changed, 3885 insertions(+), 1 deletion(-)

diff --git a/lightening.am b/lightening.am
index 2c9089ead..ba55f2c7f 100644
--- a/lightening.am
+++ b/lightening.am
@@ -40,6 +40,7 @@ lightening_extra_files =                              \
        $(lightening)/lightening/mips.h                 \
        $(lightening)/lightening/ppc.h                  \
        $(lightening)/lightening/x86.h                  \
+       $(lightening)/lightening/riscv.h                \
                                                        \
        $(lightening)/lightening/aarch64.c              \
        $(lightening)/lightening/aarch64-cpu.c          \
@@ -55,4 +56,7 @@ lightening_extra_files =                              \
        $(lightening)/lightening/ppc-fpu.c              \
        $(lightening)/lightening/x86.c                  \
        $(lightening)/lightening/x86-cpu.c              \
-       $(lightening)/lightening/x86-sse.c
+       $(lightening)/lightening/x86-sse.c              \
+       $(lightening)/lightening/riscv.c                \
+       $(lightening)/lightening/riscv-cpu.c            \
+       $(lightening)/lightening/riscv-fpu.c
diff --git a/lightening.h b/lightening.h
index efa5dfdf1..fd39a6406 100644
--- a/lightening.h
+++ b/lightening.h
@@ -77,6 +77,8 @@ jit_same_fprs (jit_fpr_t a, jit_fpr_t b)
 #  include "lightening/aarch64.h"
 #elif defined(__s390__) || defined(__s390x__)
 #  include "lightening/s390.h"
+#elif defined(__riscv__) || defined(__riscv)
+#  include "lightening/riscv.h"
 #endif
 
 enum jit_reloc_kind
diff --git a/lightening/endian.h b/lightening/endian.h
index 3b34a1518..e3689a117 100644
--- a/lightening/endian.h
+++ b/lightening/endian.h
@@ -38,6 +38,8 @@
 #    else
 #      define __WORDSIZE       64
 #    endif
+#  elif defined(__riscv_xlen)
+#    define __WORDSIZE __riscv_xlen            /* riscv */
 #  else                                                /* From FreeBSD 9.1 
stdint.h */
 #    if defined(UINTPTR_MAX) && defined(UINT64_MAX) && \
        (UINTPTR_MAX == UINT64_MAX)
diff --git a/lightening/lightening.c b/lightening/lightening.c
index ad990eb01..593429dcd 100644
--- a/lightening/lightening.c
+++ b/lightening/lightening.c
@@ -269,6 +269,22 @@ get_temp_gpr(jit_state_t *_jit)
 #ifdef JIT_TMP1
     case 1:
       return JIT_TMP1;
+#endif
+#ifdef JIT_TMP2
+    case 2:
+      return JIT_TMP2;
+#endif
+#ifdef JIT_TMP3
+    case 3:
+      return JIT_TMP3;
+#endif
+#ifdef JIT_TMP4
+    case 4:
+      return JIT_TMP4;
+#endif
+#ifdef JIT_TMP5
+    case 5:
+      return JIT_TMP5;
 #endif
     default:
       abort();
@@ -559,6 +575,8 @@ jit_emit_addr(jit_state_t *j)
 # include "aarch64.c"
 #elif defined(__s390__) || defined(__s390x__)
 # include "s390.c"
+#elif defined(__riscv__) || defined(__riscv)
+# include "riscv.c"
 #endif
 
 #define JIT_IMPL_0(stem, ret) \
@@ -1156,6 +1174,9 @@ static const jit_gpr_t user_callee_save_gprs[] = {
 #endif
 #ifdef JIT_V9
   , JIT_V9
+#endif
+#ifdef JIT_V10
+  , JIT_V10
 #endif
  };
 
@@ -1184,6 +1205,18 @@ static const jit_fpr_t user_callee_save_fprs[] = {
 #ifdef JIT_VF7
   , JIT_VF7
 #endif
+#ifdef JIT_VF8
+  , JIT_VF8
+#endif
+#ifdef JIT_VF9
+  , JIT_VF9
+#endif
+#ifdef JIT_VF10
+  , JIT_VF10
+#endif
+#ifdef JIT_VF11
+  , JIT_VF11
+#endif
 };
 
 #define ARRAY_SIZE(X) (sizeof (X)/sizeof ((X)[0]))
diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c
new file mode 100644
index 000000000..37c252a78
--- /dev/null
+++ b/lightening/riscv-cpu.c
@@ -0,0 +1,2464 @@
+/*
+ * Copyright (C) 2012-2021  Free Software Foundation, Inc.
+ *
+ * This file is part of GNU lightning.
+ *
+ * GNU lightning is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU lightning is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ * License for more details.
+ *
+ * Authors:
+ *      Paulo Cesar Pereira de Andrade
+ *      Ekaitz Zarraga <ekaitz@elenq.tech>
+ */
+#define stack_framesize   (200 + 64)
+#define simm6_p(im)       ((im) <= 31 && (im) >= -32)
+#define simm12_p(im)      ((im) <= 2047 && (im) >= -2048)
+#define simm20_p(im)      ((im) <= 524287 && (im) >= -524288)
+#define simm32_p(im)      ((im) <= 2147483647LL && (im) >= -2147483648LL)
+
+/*
+ * One 32-bit instruction word, viewed either as the raw value `w` or
+ * through one bit-field struct per encoding layout.  The field widths of
+ * every struct add up to 32 bits.
+ * NOTE(review): reading `w` after writing the bit-fields relies on the
+ * compiler allocating bit-fields starting at the least significant bit --
+ * confirm for every compiler/target this is built with.
+ */
+typedef union {
+  /* Layout filled in by Rtype(): three registers, funct3 and funct7. */
+  struct  {
+    uint32_t opcode     : 7;
+    uint32_t rd         : 5;
+    uint32_t funct3     : 3;
+    uint32_t rs1        : 5;
+    uint32_t rs2        : 5;
+    uint32_t funct7     : 7;
+  } R;
+  /* Layout filled in by R4type(): like R, but the top seven bits are split
+   * into the rl and aq flags and a 5-bit function code. */
+  struct  {
+    uint32_t opcode     : 7;
+    uint32_t rd         : 5;
+    uint32_t funct3     : 3;
+    uint32_t rs1        : 5;
+    uint32_t rs2        : 5;
+    uint32_t rl         : 1;
+    uint32_t aq         : 1;
+    uint32_t funct5     : 5;
+  } R4;
+  /* Layout filled in by Itype(): two registers and a 12-bit immediate. */
+  struct  {
+    uint32_t opcode     : 7;
+    uint32_t rd         : 5;
+    uint32_t funct3     : 3;
+    uint32_t rs1        : 5;
+    uint32_t imm11_0    : 12;
+  } I;
+#if __WORDSIZE == 64
+  /* Layout filled in by IStype(): 6-bit shift amount plus a 6-bit upper
+   * field.  Only available when __WORDSIZE is 64. */
+  struct  {
+    uint32_t opcode     : 7;
+    uint32_t rd         : 5;
+    uint32_t funct3     : 3;
+    uint32_t rs1        : 5;
+    uint32_t shamt      : 6;
+    uint32_t imm6_0     : 6;
+  } IS;
+#endif
+  /* Layout filled in by Stype(): the 12-bit immediate is split into a low
+   * 5-bit part and a high 7-bit part. */
+  struct  {
+    uint32_t opcode     : 7;
+    uint32_t imm4_0     : 5;
+    uint32_t funct3     : 3;
+    uint32_t rs1        : 5;
+    uint32_t rs2        : 5;
+    uint32_t imm11_5    : 7;
+  } S;
+  /* Layout filled in by Btype(): immediate bits 12, 11, 10..5 and 4..1 are
+   * scattered over four fields; bit 0 is not stored. */
+  struct  {
+    uint32_t opcode     : 7;
+    uint32_t imm11      : 1;
+    uint32_t imm4_1     : 4;
+    uint32_t funct3     : 3;
+    uint32_t rs1        : 5;
+    uint32_t rs2        : 5;
+    uint32_t imm10_5    : 6;
+    uint32_t imm12      : 1;
+  } B;
+  /* Layout filled in by Utype(): destination register and a 20-bit
+   * immediate occupying the upper bits of the word. */
+  struct  {
+    uint32_t opcode     : 7;
+    uint32_t rd         : 5;
+    uint32_t imm31_12   : 20;
+  } U;
+  /* Layout filled in by Jtype(): immediate bits 20, 19..12, 11 and 10..1
+   * are scattered over four fields; bit 0 is not stored. */
+  struct  {
+    uint32_t opcode     : 7;
+    uint32_t rd         : 5;
+    uint32_t imm19_12   : 8;
+    uint32_t imm11      : 1;
+    uint32_t imm10_1    : 10;
+    uint32_t imm20      : 1;
+  } J;
+  /* The complete instruction word. */
+  uint32_t w;
+} instr_t;
+
+
+// TODO: Compressed instruction support
+
+/*
+ * Assemble an instruction word using the R layout of instr_t.
+ *
+ * op is the 7-bit opcode, rd/rs1/rs2 are 5-bit register numbers, fct goes
+ * into funct3 (3 bits) and fct2 into funct7 (7 bits).  A value that does
+ * not fit its field trips an assertion.  Returns the assembled word.
+ */
+static uint32_t
+Rtype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t rs2,
+      int32_t fct2)
+{
+  assert((op   & ~0x7f) == 0);
+  assert((rd   & ~0x1f) == 0);
+  assert((fct  & ~0x07) == 0);
+  assert((rs1  & ~0x1f) == 0);
+  assert((rs2  & ~0x1f) == 0);
+  assert((fct2 & ~0x7f) == 0);
+
+  instr_t     word = {
+    .R = {
+      .opcode = op,
+      .rd     = rd,
+      .funct3 = fct,
+      .rs1    = rs1,
+      .rs2    = rs2,
+      .funct7 = fct2,
+    },
+  };
+  return word.w;
+}
+
+/*
+ * Assemble an instruction word using the R4 layout of instr_t.
+ *
+ * op is the 7-bit opcode, rd/rs1/rs2 are 5-bit register numbers, fct goes
+ * into funct3, aq and rl are single-bit flags and fct5 is the 5-bit
+ * function code.  Note the parameter order: aq comes before rl.  A value
+ * that does not fit its field trips an assertion.  Returns the assembled
+ * word.
+ */
+static uint32_t
+R4type(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t rs2,
+       int32_t aq, int32_t rl, int32_t fct5)
+{
+  assert((op   & ~0x7f) == 0);
+  assert((rd   & ~0x1f) == 0);
+  assert((fct  & ~0x07) == 0);
+  assert((rs1  & ~0x1f) == 0);
+  assert((rs2  & ~0x1f) == 0);
+  assert((fct5 & ~0x1f) == 0);
+  assert((aq   & ~0x01) == 0);
+  assert((rl   & ~0x01) == 0);
+
+  instr_t     word = {
+    .R4 = {
+      .opcode = op,
+      .rd     = rd,
+      .funct3 = fct,
+      .rs1    = rs1,
+      .rs2    = rs2,
+      .rl     = rl,
+      .aq     = aq,
+      .funct5 = fct5,
+    },
+  };
+  return word.w;
+}
+
+/*
+ * Assemble an instruction word using the I layout of instr_t.
+ *
+ * op is the 7-bit opcode, rd/rs1 are 5-bit register numbers, fct goes into
+ * funct3 and imm must satisfy simm12_p (-2048..2047); its low 12 bits are
+ * stored.  Out-of-range arguments trip an assertion.  Returns the
+ * assembled word.
+ */
+static uint32_t
+Itype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t imm)
+{
+  assert((op  & ~0x7f) == 0);
+  assert((rd  & ~0x1f) == 0);
+  assert((fct & ~0x07) == 0);
+  assert((rs1 & ~0x1f) == 0);
+  assert(simm12_p(imm));
+
+  instr_t     word = {
+    .I = {
+      .opcode  = op,
+      .rd      = rd,
+      .funct3  = fct,
+      .rs1     = rs1,
+      .imm11_0 = imm,
+    },
+  };
+  return word.w;
+}
+
+#  if __WORDSIZE == 64
+/*
+ * Assemble an instruction word using the IS layout of instr_t (only built
+ * when __WORDSIZE is 64).
+ *
+ * op is the 7-bit opcode, rd/rs1 are 5-bit register numbers, fct goes into
+ * funct3, sh is the 6-bit shift amount and imm (checked with simm6_p) fills
+ * the 6-bit field above it.  Out-of-range arguments trip an assertion.
+ * Returns the assembled word.
+ */
+static uint32_t
+IStype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t sh,
+       int32_t imm)
+{
+  assert((op  & ~0x7f) == 0);
+  assert((rd  & ~0x1f) == 0);
+  assert((fct & ~0x07) == 0);
+  assert((rs1 & ~0x1f) == 0);
+  assert((sh  & ~0x3f) == 0);
+  assert(simm6_p(imm));
+
+  instr_t     word = {
+    .IS = {
+      .opcode = op,
+      .rd     = rd,
+      .funct3 = fct,
+      .rs1    = rs1,
+      .shamt  = sh,
+      .imm6_0 = imm,
+    },
+  };
+  return word.w;
+}
+#  endif
+
+/*
+ * Assemble an instruction word using the S layout of instr_t.
+ *
+ * op is the 7-bit opcode, fct goes into funct3, rs1/rs2 are 5-bit register
+ * numbers and imm must satisfy simm12_p; it is split into bits 4..0 and
+ * bits 11..5.  Out-of-range arguments trip an assertion.  Returns the
+ * assembled word.
+ */
+static uint32_t
+Stype(int32_t op, int32_t fct, int32_t rs1, int32_t rs2, int32_t imm)
+{
+  assert((op  & ~0x7f) == 0);
+  assert((fct & ~0x07) == 0);
+  assert((rs1 & ~0x1f) == 0);
+  assert((rs2 & ~0x1f) == 0);
+  assert(simm12_p(imm));
+
+  instr_t     word = {
+    .S = {
+      .opcode  = op,
+      .imm4_0  = imm & 0x1f,
+      .funct3  = fct,
+      .rs1     = rs1,
+      .rs2     = rs2,
+      .imm11_5 = (imm >> 5) & 0x7f,
+    },
+  };
+  return word.w;
+}
+
+/*
+ * Assemble an instruction word using the B layout of instr_t.
+ *
+ * op is the 7-bit opcode, fct goes into funct3, rs1/rs2 are 5-bit register
+ * numbers and imm is an even offset whose bits 12, 11, 10..5 and 4..1 are
+ * scattered over the word.  Out-of-range arguments trip an assertion.
+ * Returns the assembled word.
+ *
+ * NOTE(review): the range check is simm12_p (-2048..2047) although the
+ * layout also stores bit 12 -- confirm whether the narrower limit is
+ * intentional.
+ */
+static uint32_t
+Btype(int32_t op, int32_t fct, int32_t rs1, int32_t rs2, int32_t imm)
+{
+  assert((op  & ~0x7f) == 0);
+  assert((fct & ~0x07) == 0);
+  assert((rs1 & ~0x1f) == 0);
+  assert((rs2 & ~0x1f) == 0);
+  assert(!(imm & 1) && simm12_p(imm));
+
+  instr_t     word = {
+    .B = {
+      .opcode  = op,
+      .imm11   = (imm >> 11) & 0x1,
+      .imm4_1  = (imm >>  1) & 0xf,
+      .funct3  = fct,
+      .rs1     = rs1,
+      .rs2     = rs2,
+      .imm10_5 = (imm >>  5) & 0x3f,
+      .imm12   = (imm >> 12) & 0x1,
+    },
+  };
+  return word.w;
+}
+
+/*
+ * Assemble an instruction word using the U layout of instr_t.
+ *
+ * op is the 7-bit opcode, rd a 5-bit register number and imm must satisfy
+ * simm20_p; its low 20 bits are stored in the upper 20 bits of the word.
+ * Out-of-range arguments trip an assertion.  Returns the assembled word.
+ */
+static uint32_t
+Utype(int32_t op, int32_t rd, int32_t imm)
+{
+  assert((op & ~0x7f) == 0);
+  assert((rd & ~0x1f) == 0);
+  assert(simm20_p(imm));
+
+  instr_t     word = {
+    .U = {
+      .opcode   = op,
+      .rd       = rd,
+      .imm31_12 = imm,
+    },
+  };
+  return word.w;
+}
+
+/*
+ * Assemble an instruction word using the J layout of instr_t.
+ *
+ * op is the 7-bit opcode, rd a 5-bit register number and imm an even offset
+ * in -1048576..1048575 whose bits 20, 19..12, 11 and 10..1 are scattered
+ * over the word.  Out-of-range arguments trip an assertion.  Returns the
+ * assembled word.
+ */
+static uint32_t
+Jtype(int32_t op, int32_t rd, int32_t imm)
+{
+  assert((op & ~0x7f) == 0);
+  assert((rd & ~0x1f) == 0);
+  assert(!(imm & 1) && imm <= 1048575 && imm >= -1048576);
+
+  instr_t     word = {
+    .J = {
+      .opcode   = op,
+      .rd       = rd,
+      .imm19_12 = (imm >> 12) &  0xff,
+      .imm11    = (imm >> 11) &   0x1,
+      .imm10_1  = (imm >>  1) & 0x3ff,
+      .imm20    = (imm >> 20) &   0x1,
+    },
+  };
+  return word.w;
+}
+
+/*
+ * RV32I Base Instruction Set
+ */
+#define _LUI(rd, imm)              Utype(55, rd, imm)
+#define _AUIPC(rd, imm)            Utype(23, rd, imm)
+#define _JAL(rd, imm)              Jtype(111, rd, imm)
+#define _JALR(rd, rs1, imm)        Itype(103, rd, 0, rs1, imm)
+#define _BEQ(rs1, rs2, imm)        Btype(99, 0, rs1, rs2, imm)
+#define _BNE(rs1, rs2, imm)        Btype(99, 1, rs1, rs2, imm)
+#define _BLT(rs1, rs2, imm)        Btype(99, 4, rs1, rs2, imm)
+#define _BGE(rs1, rs2, imm)        Btype(99, 5, rs1, rs2, imm)
+#define _BLTU(rs1, rs2, imm)       Btype(99, 6, rs1, rs2, imm)
+#define _BGEU(rs1, rs2, imm)       Btype(99, 7, rs1, rs2, imm)
+#define _LB(rd, rs1, imm)          Itype(3, rd, 0, rs1, imm)
+#define _LH(rd, rs1, imm)          Itype(3, rd, 1, rs1, imm)
+#define _LW(rd, rs1, imm)          Itype(3, rd, 2, rs1, imm)
+#define _LBU(rd, rs1, imm)         Itype(3, rd, 4, rs1, imm)
+#define _LHU(rd, rs1, imm)         Itype(3, rd, 5, rs1, imm)
+#define _SB(rs1, rs2, imm)         Stype(35, 0, rs1, rs2, imm)
+#define _SH(rs1, rs2, imm)         Stype(35, 1, rs1, rs2, imm)
+#define _SW(rs1, rs2, imm)         Stype(35, 2, rs1, rs2, imm)
+#define _ADDI(rd, rs1, imm)        Itype(19, rd, 0, rs1, imm)
+#define _SLTI(rd, rs1, imm)        Itype(19, rd, 2, rs1, imm)
+#define _SLTIU(rd, rs1, imm)       Itype(19, rd, 3, rs1, imm)
+#define _XORI(rd, rs1, imm)        Itype(19, rd, 4, rs1, imm)
+#define _ORI(rd, rs1, imm)         Itype(19, rd, 6, rs1, imm)
+#define _ANDI(rd, rs1, imm)        Itype(19, rd, 7, rs1, imm)
+#if __WORDSIZE == 32
+#  define _SLLI(rd, rs1, imm)      Rtype(19, rd, 1, rs1, imm,  0)
+#  define _SRLI(rd, rs1, imm)      Rtype(19, rd, 5, rs1, imm,  0)
+#  define _SRAI(rd, rs1, imm)      Rtype(19, rd, 5, rs1, imm, 32)
+#endif
+#define _ADD(rd, rs1, rs2)         Rtype(51, rd, 0, rs1, rs2,  0)
+#define _SUB(rd, rs1, rs2)         Rtype(51, rd, 0, rs1, rs2, 32)
+#define _SLL(rd, rs1, rs2)         Rtype(51, rd, 1, rs1, rs2,  0)
+#define _SLT(rd, rs1, rs2)         Rtype(51, rd, 2, rs1, rs2,  0)
+#define _SLTU(rd, rs1, rs2)        Rtype(51, rd, 3, rs1, rs2,  0)
+#define _XOR(rd, rs1, rs2)         Rtype(51, rd, 4, rs1, rs2,  0)
+#define _SRL(rd, rs1, rs2)         Rtype(51, rd, 5, rs1, rs2,  0)
+#define _SRA(rd, rs1, rs2)         Rtype(51, rd, 5, rs1, rs2, 32)
+#define _OR(rd, rs1, rs2)          Rtype(51, rd, 6, rs1, rs2,  0)
+#define _AND(rd, rs1, rs2)         Rtype(51, rd, 7, rs1, rs2,  0)
+#define _FENCE(imm)                Itype( 15,  0, 0,    0, imm)
+#define _FENCE_I(imm)              Itype( 15,  0, 1,    0, imm)
+#define _ECALL()                   Itype(115,  0, 0,    0,   0)
+#define _EBREAK()                  Itype(115,  0, 0,    0,   1)
+#define _CSRRW(rd, rs1, csr)       Itype(115, rd, 1,  rs1, csr)
+#define _CSRRS(rd, rs1, csr)       Itype(115, rd, 2,  rs1, csr)
+#define _CSRRC(rd, rs1, csr)       Itype(115, rd, 3,  rs1, csr)
+#define _CSRRWI(rd, zimm, csr)     Itype(115, rd, 5, zimm, csr)
+#define _CSRRSI(rd, zimm, csr)     Itype(115, rd, 6, zimm, csr)
+#define _CSRRCI(rd, zimm, csr)     Itype(115, rd, 7, zimm, csr)
+/*
+ * RV64I Base Instruction Set (in addition to RV32I)
+ */
+#define _LWU(rd, rs1, imm)         Itype(3, rd, 6, rs1, imm)
+#define _LD(rd, rs1, imm)          Itype(3, rd, 3, rs1, imm)
+#define _SD(rs1, rs2, imm)         Stype(35, 3, rs1, rs2, imm)
+#if __WORDSIZE == 64
+#  define _SLLI(rd, rs1, sh)       IStype(19, rd, 1, rs1, sh,  0)
+#  define _SRLI(rd, rs1, sh)       IStype(19, rd, 5, rs1, sh,  0)
+#  define _SRAI(rd, rs1, sh)       IStype(19, rd, 5, rs1, sh, 16)
+#endif
+#define _ADDIW(rd, rs1, imm)       Itype(27, rd, 0, rs1, imm)
+#define _SLLIW(rd, rs1, imm)       Rtype(27, rd, 1, rs1, imm,  0)
+/* Right shifts use funct3 = 5, exactly like _SRLI/_SRAI and _SRLW/_SRAW;
+ * the arithmetic variant is selected by funct7 = 32. */
+#define _SRLIW(rd, rs1, imm)       Rtype(27, rd, 5, rs1, imm,  0)
+#define _SRAIW(rd, rs1, imm)       Rtype(27, rd, 5, rs1, imm, 32)
+/* Register-register word operations: the third operand is a register. */
+#define _ADDW(rd, rs1, rs2)        Rtype(59, rd, 0, rs1, rs2,  0)
+#define _SUBW(rd, rs1, rs2)        Rtype(59, rd, 0, rs1, rs2, 32)
+#define _SLLW(rd, rs1, rs2)        Rtype(59, rd, 1, rs1, rs2,  0)
+#define _SRLW(rd, rs1, rs2)        Rtype(59, rd, 5, rs1, rs2,  0)
+#define _SRAW(rd, rs1, rs2)        Rtype(59, rd, 5, rs1, rs2, 32)
+/*
+ * RV32M Standard Extension
+ */
+#define _MUL(rd, rs1, rs2)         Rtype(51, rd, 0, rs1, rs2, 1)
+#define _MULH(rd, rs1, rs2)        Rtype(51, rd, 1, rs1, rs2, 1)
+#define _MULHSU(rd, rs1, rs2)      Rtype(51, rd, 2, rs1, rs2, 1)
+#define _MULHU(rd, rs1, rs2)       Rtype(51, rd, 3, rs1, rs2, 1)
+#define _DIV(rd, rs1, rs2)         Rtype(51, rd, 4, rs1, rs2, 1)
+#define _DIVU(rd, rs1, rs2)        Rtype(51, rd, 5, rs1, rs2, 1)
+#define _REM(rd, rs1, rs2)         Rtype(51, rd, 6, rs1, rs2, 1)
+#define _REMU(rd, rs1, rs2)        Rtype(51, rd, 7, rs1, rs2, 1)
+/*
+ * RV64M Standard Extension (in addition to RV32M)
+ */
+#define _MULW(rd, rs1, rs2)        Rtype(59, rd, 0, rs1, rs2, 1)
+#define _DIVW(rd, rs1, rs2)        Rtype(59, rd, 4, rs1, rs2, 1)
+#define _DIVUW(rd, rs1, rs2)       Rtype(59, rd, 5, rs1, rs2, 1)
+#define _REMW(rd, rs1, rs2)        Rtype(59, rd, 6, rs1, rs2, 1)
+#define _REMUW(rd, rs1, rs2)       Rtype(59, rd, 7, rs1, rs2, 1)
+/*
+ * RV32A Standard Extension
+ */
+/* The macros take their ordering flags as (rl, aq) while R4type() expects
+ * (aq, rl); the arguments are forwarded so that each flag reaches the
+ * parameter of the same name. */
+#define _LR_W(rd, rs1, rl, aq)             R4type(47, rd, 2, rs1,   0, aq, rl,  2)
+#define _SC_W(rd, rs1, rs2, rl, aq)        R4type(47, rd, 2, rs1, rs2, aq, rl,  3)
+#define _AMOSWAP_W(rd, rs1, rs2, rl, aq)   R4type(47, rd, 2, rs1, rs2, aq, rl,  1)
+#define _AMOADD_W(rd, rs1, rs2, rl, aq)    R4type(47, rd, 2, rs1, rs2, aq, rl,  0)
+#define _AMOXOR_W(rd, rs1, rs2, rl, aq)    R4type(47, rd, 2, rs1, rs2, aq, rl,  4)
+#define _AMOAND_W(rd, rs1, rs2, rl, aq)    R4type(47, rd, 2, rs1, rs2, aq, rl, 12)
+#define _AMOOR_W(rd, rs1, rs2, rl, aq)     R4type(47, rd, 2, rs1, rs2, aq, rl,  8)
+#define _AMOMIN_W(rd, rs1, rs2, rl, aq)    R4type(47, rd, 2, rs1, rs2, aq, rl, 16)
+#define _AMOMAX_W(rd, rs1, rs2, rl, aq)    R4type(47, rd, 2, rs1, rs2, aq, rl, 20)
+#define _AMOMINU_W(rd, rs1, rs2, rl, aq)   R4type(47, rd, 2, rs1, rs2, aq, rl, 24)
+#define _AMOMAXU_W(rd, rs1, rs2, rl, aq)   R4type(47, rd, 2, rs1, rs2, aq, rl, 28)
+/*
+ * RV64A Standard Extension (in addition to RV32A)
+ */
+/* The macros take their ordering flags as (rl, aq) while R4type() expects
+ * (aq, rl); the arguments are forwarded so that each flag reaches the
+ * parameter of the same name. */
+#define _LR_D(rd, rs1, rl, aq)             R4type(47, rd, 3, rs1,   0, aq, rl,  2)
+#define _SC_D(rd, rs1, rs2, rl, aq)        R4type(47, rd, 3, rs1, rs2, aq, rl,  3)
+#define _AMOSWAP_D(rd, rs1, rs2, rl, aq)   R4type(47, rd, 3, rs1, rs2, aq, rl,  1)
+#define _AMOADD_D(rd, rs1, rs2, rl, aq)    R4type(47, rd, 3, rs1, rs2, aq, rl,  0)
+#define _AMOXOR_D(rd, rs1, rs2, rl, aq)    R4type(47, rd, 3, rs1, rs2, aq, rl,  4)
+#define _AMOAND_D(rd, rs1, rs2, rl, aq)    R4type(47, rd, 3, rs1, rs2, aq, rl, 12)
+#define _AMOOR_D(rd, rs1, rs2, rl, aq)     R4type(47, rd, 3, rs1, rs2, aq, rl,  8)
+#define _AMOMIN_D(rd, rs1, rs2, rl, aq)    R4type(47, rd, 3, rs1, rs2, aq, rl, 16)
+#define _AMOMAX_D(rd, rs1, rs2, rl, aq)    R4type(47, rd, 3, rs1, rs2, aq, rl, 20)
+#define _AMOMINU_D(rd, rs1, rs2, rl, aq)   R4type(47, rd, 3, rs1, rs2, aq, rl, 24)
+#define _AMOMAXU_D(rd, rs1, rs2, rl, aq)   R4type(47, rd, 3, rs1, rs2, aq, rl, 28)
+/*
+ * Pseudo Instructions
+ */
+#define _NOP()                      _ADDI((jit_gpr_regno(_ZERO)),\
+                                          (jit_gpr_regno(_ZERO)), 0)
+#define _MV(r0, r1)                _ADDI(r0, r1, 0)
+#define _NOT(r0, r1)               _XORI(r0, r1, -1)
+#define _NEG(r0, r1)               _SUB(r0, (jit_gpr_regno(_ZERO)), r1)
+#define _NEGW(r0, r1)              _SUBW(r0, (jit_gpr_regno(_ZERO)), r1)
+#define _SEXT_W(r0, r1)            _ADDIW(r0, r1, 0)
+#define _RET()                     _JALR((jit_gpr_regno(_ZERO)),\
+                                         (jit_gpr_regno(_RA)), 0)
+
+
+
+// Help to make all easier
+#define em_wp(jit, inst)          emit_u32_with_pool(jit, inst)
+
+/*
+ * JIT INSTRUCTIONS
+ */
+
+// Binary ALU operations
+static void addr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void addi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void addcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void addci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void addxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void addxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+
+static void subr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void subi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void subcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void subci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void subxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void subxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+
+static void muli(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void mulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+
+static void divr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void divi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void divr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void divi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+
+static void remi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void remr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void remi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void remr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+
+static void andr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void andi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void orr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void ori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void xorr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void xori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void lshi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void lshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void rshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void rshi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0);
+static void rshr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void rshi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0);
+
+
+// Four operand ALU operations
+/* NOTE(review): presumably r0 and r1 receive the two result words of the
+ * wide multiply / divide-with-remainder -- confirm against the definitions. */
+static void qmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3);
+static void qmulr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3);
+static void qmuli(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0);
+static void qmuli_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0);
+
+static void qdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3);
+static void qdivr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3);
+static void qdivi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0);
+static void qdivi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0);
+
+
+// Unary ALU operations
+static void negr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void comr(jit_state_t *_jit, int32_t r0, int32_t r1);
+
+
+// Transfer operations
+static void movr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void movi(jit_state_t *_jit, int32_t r0, jit_word_t i0);
+
+static uint64_t patch_load_from_pool(uint64_t instrs, uint32_t off);
+static jit_reloc_t emit_load_from_pool(jit_state_t *_jit, uint64_t insts);
+static jit_reloc_t mov_addr(jit_state_t *_jit, int32_t r0);
+static jit_reloc_t movi_from_pool(jit_state_t *_jit, int32_t r0);
+
+static void extr_c(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void extr_uc(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void extr_s(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void extr_us(jit_state_t *_jit, int32_t r0, int32_t r1);
+
+#  if __WORDSIZE == 64
+static void extr_i(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void extr_ui(jit_state_t *_jit, int32_t r0, int32_t r1);
+#endif
+
+
+// Branch instructions
+static uint32_t patch_cc_jump(uint32_t inst, int32_t offset);
+static jit_reloc_t emit_cc_jump(jit_state_t *_jit, uint32_t inst);
+
+static jit_reloc_t bltr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t blti(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bltr_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t blti_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bler(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t blei(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bler_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t blei_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t beqr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t beqi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bger(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bgei(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bger_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bgei_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bgtr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bgti(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bgtr_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bgti_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bner(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bnei(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+
+static jit_reloc_t bmsr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bmsi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bmcr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bmci(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t boaddr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t boaddi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t boaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t boaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bxaddr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bxaddi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bxaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bxaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bosubr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bosubi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bosubr_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bosubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bxsubr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bxsubi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bxsubr_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bxsubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+
+
+// Store operations
+/* Register-addressed stores, one per operand width (suffix c, s, i, and l
+ * on 64-bit builds), mirroring the sti_*, stxr_* and stxi_* families.
+ * NOTE(review): presumably r0 is the address register and r1 the value --
+ * confirm against the definitions. */
+static void str_c(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void str_uc(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void str_s(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void str_i(jit_state_t *_jit, int32_t r0, int32_t r1);
+#if __WORDSIZE == 64
+static void str_l(jit_state_t *_jit, int32_t r0, int32_t r1);
+#endif
+
+static void sti_c(jit_state_t *_jit, jit_word_t i0, int32_t r0);
+static void sti_s(jit_state_t *_jit, jit_word_t i0, int32_t r0);
+static void sti_i(jit_state_t *_jit, jit_word_t i0, int32_t r0);
+#if __WORDSIZE == 64
+static void sti_l(jit_state_t *_jit, jit_word_t i0, int32_t r0);
+#endif
+
+static void stxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void stxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void stxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+# if __WORDSIZE == 64
+static void stxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+#endif
+
+static void stxi_c(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1);
+static void stxi_s(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1);
+static void stxi_i(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1);
+# if __WORDSIZE == 64
+static void stxi_l(jit_state_t *_jit,jit_word_t i0,int32_t r0,int32_t r1);
+# endif
+
+
+// Load operations
+static void ldr_c(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void ldr_uc(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void ldr_s(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void ldr_us(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void ldr_i(jit_state_t *_jit, int32_t r0, int32_t r1);
+# if __WORDSIZE == 64
+static void ldr_ui(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void ldr_l(jit_state_t *_jit, int32_t r0, int32_t r1);
+# endif
+
+static void ldi_c(jit_state_t *_jit, int32_t r0, jit_word_t i0);
+static void ldi_uc(jit_state_t *_jit, int32_t r0, jit_word_t i0);
+static void ldi_s(jit_state_t *_jit, int32_t r0, jit_word_t i0);
+static void ldi_us(jit_state_t *_jit, int32_t r0, jit_word_t i0);
+static void ldi_i(jit_state_t *_jit, int32_t r0, jit_word_t i0);
+# if __WORDSIZE == 64
+static void ldi_ui(jit_state_t *_jit, int32_t r0, jit_word_t i0);
+static void ldi_l(jit_state_t *_jit, int32_t r0, jit_word_t i0);
+# endif
+
+static void ldxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void ldxr_uc(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void ldxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void ldxr_us(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void ldxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+#  if __WORDSIZE == 64
+static void ldxr_ui(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void ldxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+#endif
+
+static void ldxi_c(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void ldxi_uc(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void ldxi_us(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void ldxi_s(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void ldxi_i(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+#  if __WORDSIZE == 64
+static void ldxi_ui(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void ldxi_l(jit_state_t *_jit,int32_t r0,int32_t r1,jit_word_t i0);
+#endif
+
+
+// Argument management
+//static void pushr(jit_state_t *_jit, int32_t r0);
+//static void popr(jit_state_t *_jit, int32_t r0);
+static void ret(jit_state_t *_jit);
+static void retr(jit_state_t *_jit, int32_t r0);
+static void reti(jit_state_t *_jit, jit_word_t i0);
+static void retval_c(jit_state_t *_jit, int32_t r0);
+static void retval_uc(jit_state_t *_jit, int32_t r0);
+static void retval_s(jit_state_t *_jit, int32_t r0);
+static void retval_us(jit_state_t *_jit, int32_t r0);
+static void retval_i(jit_state_t *_jit, int32_t r0);
+#  if __WORDSIZE == 64
+static void retval_ui(jit_state_t *_jit, int32_t r0);
+static void retval_l(jit_state_t *_jit, int32_t r0);
+#endif
+
+// Jump and return
+static uint32_t patch_jump(uint32_t inst, int32_t offset);
+static jit_reloc_t emit_jump(jit_state_t *_jit, uint32_t inst);
+
+static void callr(jit_state_t *_jit, int32_t r0);
+static void calli(jit_state_t *_jit, jit_word_t i0);
+static void jmpi_with_link(jit_state_t *_jit, jit_word_t i0);
+static void pop_link_register(jit_state_t *_jit);
+static void push_link_register(jit_state_t *_jit);
+static void jmpr(jit_state_t *_jit, int32_t r0);
+static void jmpi(jit_state_t *_jit, jit_word_t i0);
+static jit_reloc_t jmp(jit_state_t *_jit);
+
+
+// Atomic operations
+static void ldr_atomic(jit_state_t *_jit, int32_t dst, int32_t loc);
+static void str_atomic(jit_state_t *_jit, int32_t loc, int32_t val);
+static void swap_atomic(jit_state_t *_jit, int32_t dst, int32_t loc,
+    int32_t val);
+static void cas_atomic(jit_state_t *_jit, int32_t dst, int32_t loc,
+    int32_t expected, int32_t desired);
+
+// Byte swapping operations
+static void bswapr_us(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void bswapr_ui(jit_state_t *_jit, int32_t r0, int32_t r1);
+#  if __WORDSIZE == 64
+static void
+bswapr_ul(jit_state_t *_jit, int32_t r0, int32_t r1);
+#endif
+
+// Others
+static void nop(jit_state_t *_jit, int32_t im);
+static void mfence(jit_state_t *_jit);
+static void breakpoint(jit_state_t *_jit);
+
+
+
+/*
+ * Binary ALU operations
+ */
+/* Emit an ADD of registers r1 and r2 into register r0. */
+static void
+addr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  uint32_t insn = _ADD(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+/*
+ * Emit r0 = r1 + i0.  A constant accepted by simm12_p is encoded directly
+ * in one ADDI; any other constant is first loaded into a temporary
+ * register with movi() and then added with addr().
+ */
+static void
+addi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0)) {
+    em_wp(_jit, _ADDI(r0, r1, i0));
+    return;
+  }
+
+  jit_gpr_t scratch = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(scratch), i0);
+  addr(_jit, r0, r1, jit_gpr_regno(scratch));
+  unget_temp_gpr(_jit);
+}
+
+/*
+ * Emit r0 = r1 + r2 and record the carry: JIT_CARRY is set with SLTU to
+ * (sum < r1), compared as unsigned values.
+ *
+ * TODO (kept from the original author): not sure if this is correct.
+ */
+static void
+addcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  if (r0 != r1) {
+    /* r1 survives the addition, so it can be compared with the sum. */
+    addr(_jit, r0, r1, r2);
+    em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r0, r1));
+    return;
+  }
+
+  /* The destination aliases r1: build the sum in a temporary so that r1 is
+   * still intact for the comparison, then move the sum into place. */
+  jit_gpr_t sum = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(sum), r1, r2);
+  em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), jit_gpr_regno(sum), r1));
+  movr(_jit, r0, jit_gpr_regno(sum));
+  unget_temp_gpr(_jit);
+}
+
+// Immediate form of addcr: add i0 to r1 into r0 and write the SLTU of the
+// sum against r1 into JIT_CARRY. A temporary holds the sum when r0 == r1.
+static void
+addci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (r0 != r1) {
+    addi(_jit, r0, r1, i0);
+    em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r0, r1));
+    return;
+  }
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+  addi(_jit, sum, r1, i0);
+  em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), sum, r1));
+  movr(_jit, r0, sum);
+  unget_temp_gpr(_jit);
+}
+
+// Add with carry-in: saves JIT_CARRY in a temporary, performs
+// addcr(r0, r1, r2) and then addcr(r0, r0, saved carry).
+// NOTE(review): the second addcr rewrites JIT_CARRY, so a carry produced by
+// the first addition appears to be dropped -- confirm this is intended.
+static void
+addxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t carry_in = jit_gpr_regno(get_temp_gpr(_jit));
+  movr(_jit, carry_in, jit_gpr_regno(JIT_CARRY));
+  addcr(_jit, r0, r1, r2);
+  addcr(_jit, r0, r0, carry_in);
+  unget_temp_gpr(_jit);
+}
+// Immediate add with carry-in: saves JIT_CARRY in a temporary, performs
+// addci(r0, r1, i0) and then addcr(r0, r0, saved carry).
+// NOTE(review): the second addition rewrites JIT_CARRY, so a carry produced
+// by the first one appears to be dropped -- confirm this is intended.
+static void
+addxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  const int32_t carry_in = jit_gpr_regno(get_temp_gpr(_jit));
+  movr(_jit, carry_in, jit_gpr_regno(JIT_CARRY));
+  addci(_jit, r0, r1, i0);
+  addcr(_jit, r0, r0, carry_in);
+  unget_temp_gpr(_jit);
+}
+
+// Emit a single SUB with operands (r0, r1, r2).
+static void
+subr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _SUB(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// Subtract an immediate by adding its negation through addi.
+static void
+subi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  const jit_word_t negated = -i0;
+  addi(_jit, r0, r1, negated);
+}
+
+// Subtract r2 from r1 into r0 and write the SLTU of r1 against the
+// difference into JIT_CARRY (the borrow flag). When r0 aliases r1 the
+// difference is built in a temporary so that r1 is still available for the
+// comparison.
+//
+// Fix: the non-aliased path used to call addr(), so it added the operands
+// instead of subtracting them.
+static void
+subcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  if (r0 == r1) {
+    const int32_t diff = jit_gpr_regno(get_temp_gpr(_jit));
+    subr(_jit, diff, r1, r2);
+    em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, diff));
+    movr(_jit, r0, diff);
+    unget_temp_gpr(_jit);
+  }
+  else {
+    subr(_jit, r0, r1, r2);
+    em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, r0));
+  }
+}
+
+// Immediate form of subcr: subtract i0 from r1 into r0 and write the SLTU
+// of r1 against the difference into JIT_CARRY (the borrow flag). A temporary
+// holds the difference when r0 aliases r1.
+//
+// Fix: the non-aliased path used to call addi() with +i0, so it added the
+// immediate instead of subtracting it.
+static void
+subci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (r0 == r1) {
+    const int32_t diff = jit_gpr_regno(get_temp_gpr(_jit));
+    subi(_jit, diff, r1, i0);
+    em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, diff));
+    movr(_jit, r0, diff);
+    unget_temp_gpr(_jit);
+  }
+  else {
+    subi(_jit, r0, r1, i0);
+    em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, r0));
+  }
+}
+
+// Subtract with borrow-in: saves JIT_CARRY in a temporary, performs
+// subcr(r0, r1, r2) and then subcr(r0, r0, saved carry).
+// NOTE(review): the second subcr rewrites JIT_CARRY, so a borrow produced by
+// the first subtraction appears to be dropped -- confirm this is intended.
+static void
+subxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t carry_in = jit_gpr_regno(get_temp_gpr(_jit));
+  movr(_jit, carry_in, jit_gpr_regno(JIT_CARRY));
+  subcr(_jit, r0, r1, r2);
+  subcr(_jit, r0, r0, carry_in);
+  unget_temp_gpr(_jit);
+}
+// Immediate subtract with borrow-in: saves JIT_CARRY in a temporary,
+// performs subci(r0, r1, i0) and then subcr(r0, r0, saved carry).
+// NOTE(review): the second subtraction rewrites JIT_CARRY, so a borrow
+// produced by the first one appears to be dropped -- confirm.
+static void
+subxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  const int32_t carry_in = jit_gpr_regno(get_temp_gpr(_jit));
+  movr(_jit, carry_in, jit_gpr_regno(JIT_CARRY));
+  subci(_jit, r0, r1, i0);
+  subcr(_jit, r0, r0, carry_in);
+  unget_temp_gpr(_jit);
+}
+
+// Multiply by an immediate: load i0 into a temporary, then mulr.
+static void
+muli(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  mulr(_jit, r0, r1, imm_reg);
+  unget_temp_gpr(_jit);
+}
+// Emit a single MUL with operands (r0, r1, r2).
+static void
+mulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _MUL(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// Emit a single DIV with operands (r0, r1, r2).
+static void
+divr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _DIV(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// Divide by an immediate: load i0 into a temporary, then divr.
+static void
+divi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  divr(_jit, r0, r1, imm_reg);
+  unget_temp_gpr(_jit);
+}
+
+// Emit a single DIVU with operands (r0, r1, r2).
+static void
+divr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _DIVU(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// Unsigned divide by an immediate: load i0 into a temporary, then divr_u.
+static void
+divi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  divr_u(_jit, r0, r1, imm_reg);
+  unget_temp_gpr(_jit);
+}
+
+// Remainder by an immediate: load i0 into a temporary, then remr.
+static void
+remi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  remr(_jit, r0, r1, imm_reg);
+  unget_temp_gpr(_jit);
+}
+// Emit a single REM with operands (r0, r1, r2).
+static void
+remr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _REM(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+// Unsigned remainder by an immediate: load i0 into a temporary, then remr_u.
+static void
+remi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  remr_u(_jit, r0, r1, imm_reg);
+  unget_temp_gpr(_jit);
+}
+// Emit a single REMU with operands (r0, r1, r2).
+static void
+remr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _REMU(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// Emit a single AND with operands (r0, r1, r2).
+static void
+andr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _AND(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// AND with an immediate: one ANDI when i0 passes simm12_p, otherwise the
+// immediate is loaded into a temporary and a register AND is emitted.
+static void
+andi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (!simm12_p(i0)) {
+    const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+    movi(_jit, imm_reg, i0);
+    em_wp(_jit, _AND(r0, r1, imm_reg));
+    unget_temp_gpr(_jit);
+    return;
+  }
+  em_wp(_jit, _ANDI(r0, r1, i0));
+}
+
+// Emit a single OR with operands (r0, r1, r2).
+static void
+orr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _OR(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// OR with an immediate: one ORI when i0 passes simm12_p, otherwise the
+// immediate is loaded into a temporary and orr is used.
+static void
+ori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (!simm12_p(i0)) {
+    const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+    movi(_jit, imm_reg, i0);
+    orr(_jit, r0, r1, imm_reg);
+    unget_temp_gpr(_jit);
+    return;
+  }
+  em_wp(_jit, _ORI(r0, r1, i0));
+}
+
+// Emit a single XOR with operands (r0, r1, r2).
+static void
+xorr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _XOR(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// XOR with an immediate: one XORI when i0 passes simm12_p, otherwise the
+// immediate is loaded into a temporary and xorr is used.
+static void
+xori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (!simm12_p(i0)) {
+    const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+    movi(_jit, imm_reg, i0);
+    xorr(_jit, r0, r1, imm_reg);
+    unget_temp_gpr(_jit);
+    return;
+  }
+  em_wp(_jit, _XORI(r0, r1, i0));
+}
+
+// Left shift by register: emit a single SLL with operands (r0, r1, r2).
+static void
+lshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _SLL(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// Left shift by an immediate: one SLLI when i0 passes simm12_p, otherwise
+// the amount is loaded into a temporary and lshr is used.
+// NOTE(review): simm12_p presumably accepts values outside the valid shift
+// range (including negatives) -- confirm callers never pass those.
+static void
+lshi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (!simm12_p(i0)) {
+    const int32_t amount_reg = jit_gpr_regno(get_temp_gpr(_jit));
+    movi(_jit, amount_reg, i0);
+    lshr(_jit, r0, r1, amount_reg);
+    unget_temp_gpr(_jit);
+    return;
+  }
+  em_wp(_jit, _SLLI(r0, r1, i0));
+}
+
+// Right shift by register: emit a single SRA with operands (r0, r1, r2).
+static void
+rshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _SRA(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// Right shift (SRA family) by an immediate: one SRAI when i0 passes
+// simm12_p, otherwise the amount is loaded into a temporary and rshr is used.
+static void
+rshi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0)
+{
+  if (!simm12_p(i0)) {
+    const int32_t amount_reg = jit_gpr_regno(get_temp_gpr(_jit));
+    movi(_jit, amount_reg, i0);
+    rshr(_jit, r0, r1, amount_reg);
+    unget_temp_gpr(_jit);
+    return;
+  }
+  em_wp(_jit, _SRAI(r0, r1, i0));
+}
+
+// Right shift by register: emit a single SRL with operands (r0, r1, r2).
+static void
+rshr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _SRL(r0, r1, r2);
+  em_wp(_jit, insn);
+}
+
+// Right shift (SRL family) by an immediate: one SRLI when i0 passes
+// simm12_p, otherwise the amount is loaded into a temporary and rshr_u is
+// used.
+static void
+rshi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0)
+{
+  if (!simm12_p(i0)) {
+    const int32_t amount_reg = jit_gpr_regno(get_temp_gpr(_jit));
+    movi(_jit, amount_reg, i0);
+    rshr_u(_jit, r0, r1, amount_reg);
+    unget_temp_gpr(_jit);
+    return;
+  }
+  em_wp(_jit, _SRLI(r0, r1, i0));
+}
+
+
+/*
+ * Four operand ALU operations
+ */
+// Multiply r2 by r3: MUL result goes to r0, MULH (sign != 0) or MULHU
+// (sign == 0) result goes to r1.
+//
+// When r0 aliases one of the inputs, the MUL result is first produced in a
+// temporary so the input is still intact for the MULH/MULHU that follows,
+// and is moved into r0 at the end.
+//
+// Fix: the aliased path used to fall through into the plain path, emitting
+// a second MUL/MULH pair after r0 had already overwritten an input.
+static void
+iqmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3,
+    jit_bool_t sign){
+  const int aliased = (r0 == r2 || r0 == r3);
+  int32_t low = r0;
+  if(aliased)
+    low = jit_gpr_regno(get_temp_gpr(_jit));
+  em_wp(_jit, _MUL(low, r2, r3));
+  if(sign)
+    em_wp(_jit, _MULH(r1, r2, r3));
+  else
+    em_wp(_jit, _MULHU(r1, r2, r3));
+  if(aliased){
+    movr(_jit, r0, low);
+    unget_temp_gpr(_jit);
+  }
+}
+
+// Signed variant of iqmulr (sign flag set to 1).
+static void
+qmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3)
+{
+  const jit_bool_t is_signed = 1;
+  iqmulr(_jit, r0, r1, r2, r3, is_signed);
+}
+
+// Unsigned variant of iqmulr (sign flag set to 0).
+static void
+qmulr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3)
+{
+  const jit_bool_t is_signed = 0;
+  iqmulr(_jit, r0, r1, r2, r3, is_signed);
+}
+
+// Signed iqmulr with an immediate second factor held in a temporary.
+static void
+qmuli(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  iqmulr(_jit, r0, r1, r2, imm_reg, 1);
+  unget_temp_gpr(_jit);
+}
+
+// Unsigned iqmulr with an immediate second factor held in a temporary.
+static void
+qmuli_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  iqmulr(_jit, r0, r1, r2, imm_reg, 0);
+  unget_temp_gpr(_jit);
+}
+
+// Divide r2 by r3: DIV/DIVU result goes to r0 and REM/REMU result goes to
+// r1 (signed instructions when sign != 0, unsigned otherwise).
+//
+// When r0 aliases one of the inputs, the quotient is first produced in a
+// temporary so the input is still intact for the REM/REMU that follows, and
+// is moved into r0 at the end.
+//
+// Fix: the aliased path used to fall through into the plain path, emitting
+// a second DIV/REM pair after r0 had already overwritten an input.
+static void
+iqdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3,
+    jit_bool_t sign){
+  const int aliased = (r0 == r2 || r0 == r3);
+  int32_t quot = r0;
+  if(aliased)
+    quot = jit_gpr_regno(get_temp_gpr(_jit));
+  if(sign){
+    em_wp(_jit, _DIV(quot, r2, r3));
+    em_wp(_jit, _REM(r1, r2, r3));
+  } else {
+    em_wp(_jit, _DIVU(quot, r2, r3));
+    em_wp(_jit, _REMU(r1, r2, r3));
+  }
+  if(aliased){
+    movr(_jit, r0, quot);
+    unget_temp_gpr(_jit);
+  }
+}
+
+// Signed variant of iqdivr (sign flag set to 1).
+static void
+qdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3)
+{
+  const jit_bool_t is_signed = 1;
+  iqdivr(_jit, r0, r1, r2, r3, is_signed);
+}
+
+// Unsigned variant of iqdivr (sign flag set to 0).
+static void
+qdivr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3)
+{
+  const jit_bool_t is_signed = 0;
+  iqdivr(_jit, r0, r1, r2, r3, is_signed);
+}
+
+// Signed iqdivr with an immediate divisor held in a temporary.
+static void
+qdivi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  iqdivr(_jit, r0, r1, r2, imm_reg, 1);
+  unget_temp_gpr(_jit);
+}
+
+// Unsigned iqdivr with an immediate divisor held in a temporary.
+static void
+qdivi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  iqdivr(_jit, r0, r1, r2, imm_reg, 0);
+  unget_temp_gpr(_jit);
+}
+
+
+/*
+ * Unary ALU operations
+ */
+// Emit the NEG encoding with operands (r0, r1).
+static void
+negr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _NEG(r0, r1);
+  em_wp(_jit, insn);
+}
+
+// Emit the NOT encoding with operands (r0, r1).
+static void
+comr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _NOT(r0, r1);
+  em_wp(_jit, insn);
+}
+
+
+/*
+ * Branch instructions
+ */
+// Return `inst` with the B-format immediate fields rewritten from `offset`:
+// bits 4..1, 10..5, 11 and 12 of the offset are scattered into their
+// respective fields; every other bit of the instruction is left untouched.
+static uint32_t
+patch_cc_jump(uint32_t inst, int32_t offset){
+  instr_t enc;
+  enc.w = inst;
+  enc.B.imm4_1  = (offset >>  1) & 0xf;
+  enc.B.imm10_5 = (offset >>  5) & 0x3f;
+  enc.B.imm11   = (offset >> 11) & 0x1;
+  enc.B.imm12   = (offset >> 12) & 0x1;
+  return enc.w;
+}
+
+// Emit the conditional branch `inst` and return a relocation for it.
+//
+// Each iteration records a JIT_RELOC_JCC_WITH_VENEER relocation at the
+// current pc and registers it with add_pending_literal; the instruction is
+// only emitted once that call succeeds, otherwise the whole sequence is
+// retried (presumably after the pending literals were flushed and the pc
+// moved -- confirm against add_pending_literal).
+static jit_reloc_t
+emit_cc_jump(jit_state_t *_jit, uint32_t inst)
+{
+  while (1) {
+    uint8_t *pc_base = _jit->pc.uc;   // Offset is from current PC
+    // Placeholder displacement patched into the instruction for now.
+    int32_t off = (uint8_t*)jit_address(_jit) - pc_base;
+    jit_reloc_t ret =
+      jit_reloc (_jit, JIT_RELOC_JCC_WITH_VENEER, 0, _jit->pc.uc, pc_base, 0);
+    // Passed to add_pending_literal as (width - 1).
+    uint8_t cc_jump_width = 12;
+    if (add_pending_literal(_jit, ret, cc_jump_width - 1)) {
+      em_wp(_jit, patch_cc_jump(inst, off));
+      return ret;
+    }
+  }
+}
+
+// Relocatable conditional branch built from BLT(r0, r1).
+static jit_reloc_t
+bltr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _BLT(r0, r1, 0);
+  return emit_cc_jump(_jit, insn);
+}
+
+// bltr against an immediate: i0 is first loaded into a temporary register.
+static jit_reloc_t
+blti(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bltr(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Relocatable conditional branch built from BLTU(r0, r1).
+static jit_reloc_t
+bltr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _BLTU(r0, r1, 0);
+  return emit_cc_jump(_jit, insn);
+}
+
+// bltr_u against an immediate: i0 is first loaded into a temporary register.
+static jit_reloc_t
+blti_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bltr_u(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Relocatable conditional branch built from BGE with swapped operands
+// (r1, r0).
+static jit_reloc_t
+bler(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _BGE(r1, r0, 0);
+  return emit_cc_jump(_jit, insn);
+}
+
+// bler against an immediate: i0 is first loaded into a temporary register.
+static jit_reloc_t
+blei(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bler(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Relocatable conditional branch built from BGEU with swapped operands
+// (r1, r0).
+static jit_reloc_t
+bler_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _BGEU(r1, r0, 0);
+  return emit_cc_jump(_jit, insn);
+}
+
+// bler_u against an immediate: i0 is first loaded into a temporary register.
+static jit_reloc_t
+blei_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bler_u(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Relocatable conditional branch built from BEQ(r0, r1).
+static jit_reloc_t
+beqr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _BEQ(r0, r1, 0);
+  return emit_cc_jump(_jit, insn);
+}
+
+// beqr against an immediate: i0 is first loaded into a temporary register.
+static jit_reloc_t
+beqi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = beqr(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Relocatable conditional branch built from BGE(r0, r1).
+static jit_reloc_t
+bger(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _BGE(r0, r1, 0);
+  return emit_cc_jump(_jit, insn);
+}
+
+// bger against an immediate: i0 is first loaded into a temporary register.
+static jit_reloc_t
+bgei(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bger(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Relocatable conditional branch built from BGEU(r0, r1).
+static jit_reloc_t
+bger_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _BGEU(r0, r1, 0);
+  return emit_cc_jump(_jit, insn);
+}
+
+// bger_u against an immediate: i0 is first loaded into a temporary register.
+static jit_reloc_t
+bgei_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bger_u(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Greater-than branch, expressed as bltr with the operands swapped.
+static jit_reloc_t
+bgtr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const jit_reloc_t reloc = bltr(_jit, r1, r0);
+  return reloc;
+}
+
+// bgtr against an immediate: i0 is first loaded into a temporary register.
+static jit_reloc_t
+bgti(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bgtr(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Unsigned greater-than branch, expressed as bltr_u with operands swapped.
+static jit_reloc_t
+bgtr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const jit_reloc_t reloc = bltr_u(_jit, r1, r0);
+  return reloc;
+}
+
+// bgtr_u against an immediate: i0 is first loaded into a temporary register.
+static jit_reloc_t
+bgti_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bgtr_u(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Relocatable conditional branch built from BNE(r0, r1).
+static jit_reloc_t
+bner(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _BNE(r0, r1, 0);
+  return emit_cc_jump(_jit, insn);
+}
+
+// bner against an immediate: i0 is first loaded into a temporary register.
+static jit_reloc_t
+bnei(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bner(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Branch if any masked bit is set: AND r0 with r1 into a temporary and
+// branch when the result differs from the zero register.
+static jit_reloc_t
+bmsr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t masked = jit_gpr_regno(get_temp_gpr(_jit));
+  andr(_jit, masked, r0, r1);
+  const jit_reloc_t reloc = bner(_jit, masked, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Branch if any masked bit is set: AND r0 with the immediate i0 into a
+// temporary and branch when the result differs from the zero register.
+static jit_reloc_t
+bmsi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t masked = jit_gpr_regno(get_temp_gpr(_jit));
+  andi(_jit, masked, r0, i0);
+  const jit_reloc_t reloc = bner(_jit, masked, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Branch if all masked bits are clear: AND r0 with r1 into a temporary and
+// branch when the result equals the zero register.
+static jit_reloc_t
+bmcr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t masked = jit_gpr_regno(get_temp_gpr(_jit));
+  andr(_jit, masked, r0, r1);
+  const jit_reloc_t reloc = beqr(_jit, masked, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// Branch if all masked bits are clear: AND r0 with the immediate i0 into a
+// temporary and branch when the result equals the zero register.
+static jit_reloc_t
+bmci(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t masked = jit_gpr_regno(get_temp_gpr(_jit));
+  andi(_jit, masked, r0, i0);
+  const jit_reloc_t reloc = beqr(_jit, masked, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// r0 += r1, branching when the signed addition overflowed.
+// NOTE: We need tons of temporaries because RISC-V doesn't provide any
+// easy way to solve this. We need to do it in software: overflow happened
+// exactly when "r1 is negative" disagrees with "sum is less than old r0".
+static jit_reloc_t
+boaddr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t rhs_neg = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t sum_less = jit_gpr_regno(get_temp_gpr(_jit));
+  addr(_jit, sum, r0, r1);
+  em_wp(_jit, _SLTI(rhs_neg, r1, 0));
+  em_wp(_jit, _SLT(sum_less, sum, r0));
+  // Commit the result only after the old r0 has been compared.
+  movr(_jit, r0, sum);
+  const jit_reloc_t reloc = bner(_jit, rhs_neg, sum_less);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// boaddr with an immediate addend held in a temporary register.
+static jit_reloc_t
+boaddi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = boaddr(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// r0 += r1, branching when the unsigned addition carried: the SLTU of the
+// sum against the old r0 is non-zero in that case.
+static jit_reloc_t
+boaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t carried = jit_gpr_regno(get_temp_gpr(_jit));
+  addr(_jit, sum, r0, r1);
+  em_wp(_jit, _SLTU(carried, sum, r0));
+  movr(_jit, r0, sum);
+  const jit_reloc_t reloc = bnei(_jit, carried, 0);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// boaddr_u with an immediate addend held in a temporary register.
+static jit_reloc_t
+boaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = boaddr_u(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// r0 += r1, branching when the signed addition did NOT overflow, i.e. when
+// "r1 is negative" agrees with "sum is less than old r0".
+static jit_reloc_t
+bxaddr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t rhs_neg = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t sum_less = jit_gpr_regno(get_temp_gpr(_jit));
+  addr(_jit, sum, r0, r1);
+  em_wp(_jit, _SLTI(rhs_neg, r1, 0));
+  em_wp(_jit, _SLT(sum_less, sum, r0));
+  // Commit the result only after the old r0 has been compared.
+  movr(_jit, r0, sum);
+  const jit_reloc_t reloc = beqr(_jit, rhs_neg, sum_less);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// bxaddr with an immediate addend held in a temporary register.
+static jit_reloc_t
+bxaddi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bxaddr(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// r0 += r1, branching when the unsigned addition did NOT carry: the SLTU of
+// the sum against the old r0 is zero in that case.
+static jit_reloc_t
+bxaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t carried = jit_gpr_regno(get_temp_gpr(_jit));
+  addr(_jit, sum, r0, r1);
+  em_wp(_jit, _SLTU(carried, sum, r0));
+  movr(_jit, r0, sum);
+  const jit_reloc_t reloc = beqi(_jit, carried, 0);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// bxaddr_u with an immediate addend held in a temporary register.
+static jit_reloc_t
+bxaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bxaddr_u(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// r0 -= r1, branching when the signed subtraction overflowed, i.e. when
+// "r1 is negative" disagrees with "old r0 is less than the difference".
+static jit_reloc_t
+bosubr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t diff = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t rhs_neg = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t grew = jit_gpr_regno(get_temp_gpr(_jit));
+  subr(_jit, diff, r0, r1);
+  em_wp(_jit, _SLTI(rhs_neg, r1, 0));
+  em_wp(_jit, _SLT(grew, r0, diff));
+  // Commit the result only after the old r0 has been compared.
+  movr(_jit, r0, diff);
+  const jit_reloc_t reloc = bner(_jit, rhs_neg, grew);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// bosubr with an immediate subtrahend held in a temporary register.
+static jit_reloc_t
+bosubi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bosubr(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// r0 -= r1, branching when the unsigned subtraction borrowed: the SLTU of
+// the old r0 against the difference equals 1 in that case.
+static jit_reloc_t
+bosubr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t diff = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t borrowed = jit_gpr_regno(get_temp_gpr(_jit));
+  subr(_jit, diff, r0, r1);
+  em_wp(_jit, _SLTU(borrowed, r0, diff));
+  movr(_jit, r0, diff);
+  const jit_reloc_t reloc = beqi(_jit, borrowed, 1);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// bosubr_u with an immediate subtrahend held in a temporary register.
+static jit_reloc_t
+bosubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bosubr_u(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// r0 -= r1, branching when the signed subtraction did NOT overflow, i.e.
+// when "r1 is negative" agrees with "old r0 is less than the difference".
+static jit_reloc_t
+bxsubr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t diff = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t rhs_neg = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t grew = jit_gpr_regno(get_temp_gpr(_jit));
+  subr(_jit, diff, r0, r1);
+  em_wp(_jit, _SLTI(rhs_neg, r1, 0));
+  em_wp(_jit, _SLT(grew, r0, diff));
+  // Commit the result only after the old r0 has been compared.
+  movr(_jit, r0, diff);
+  const jit_reloc_t reloc = beqr(_jit, rhs_neg, grew);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// bxsubr with an immediate subtrahend held in a temporary register.
+static jit_reloc_t
+bxsubi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bxsubr(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// r0 -= r1, branching when the unsigned subtraction did NOT borrow: the
+// SLTU of the old r0 against the difference is zero in that case.
+static jit_reloc_t
+bxsubr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t diff = jit_gpr_regno(get_temp_gpr(_jit));
+  const int32_t borrowed = jit_gpr_regno(get_temp_gpr(_jit));
+  subr(_jit, diff, r0, r1);
+  em_wp(_jit, _SLTU(borrowed, r0, diff));
+  movr(_jit, r0, diff);
+  const jit_reloc_t reloc = beqi(_jit, borrowed, 0);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+// bxsubr_u with an immediate subtrahend held in a temporary register.
+static jit_reloc_t
+bxsubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t imm_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, imm_reg, i0);
+  const jit_reloc_t reloc = bxsubr_u(_jit, r0, imm_reg);
+  unget_temp_gpr(_jit);
+  return reloc;
+}
+
+
+/*
+ * Transfer operations
+ */
+// Register move: emits MV(r0, r1), or nothing at all when both operands
+// name the same register.
+static void
+movr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  if (r0 == r1)
+    return;
+  em_wp(_jit, _MV(r0, r1));
+}
+
+// Load the immediate i0 into register r0.
+//
+// Values accepted by simm32_p take the short path: an optional LUI for the
+// upper 20 bits followed by an optional ADDI for the low 12 bits. All other
+// values are assembled piecewise with a fixed shift/add sequence.
+//
+// NOTE(review): on RV64, LUI sign-extends its 32-bit result, so for i0 in
+// 0x7FFFF800..0x7FFFFFFF (hi == 0x80000) the short path presumably yields a
+// value with the upper 32 bits set -- confirm, and consider ADDIW or the
+// long path for that range.
+static void
+movi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  int32_t srcreg = jit_gpr_regno(_ZERO);
+  if (simm32_p(i0)){
+
+    // `lo` is the sign-extended low 12 bits; the +0x800 bumps `hi` by one
+    // whenever `lo` is negative so that (hi << 12) + lo still equals i0.
+    int64_t hi = ((i0 + 0x800) >> 12) & 0xFFFFF;
+    int64_t lo = (int32_t)i0<<20>>20;
+
+    if(hi){
+      em_wp(_jit, _LUI(r0, hi));
+      srcreg = r0;
+    }
+
+    // Always emit the ADDI when no LUI was emitted, so r0 gets written even
+    // for i0 == 0.
+    if(lo || hi == 0){
+      em_wp(_jit, _ADDI(r0, srcreg, lo));
+    }
+
+  } else {
+    // 64 bits: load in various steps
+    // lui, slli, srli, addi, slli, addi, slli, addi, slli, addi
+    // hh = bits 63..44, hl = bits 43..33, lh = bits 32..22,
+    // lm = bits 21..11, ll = bits 10..0.
+    int64_t hh = (i0>>44);
+    int64_t hl = (i0>>33) - (hh<<11);
+    int64_t lh = (i0>>22) - ((hh<<22) + (hl<<11));
+    int64_t lm = (i0>>11) - ((hh<<33) + (hl<<22) + (lh<<11));
+    int64_t ll = i0       - ((hh<<44) + (hl<<33) + (lh<<22) + (lm<<11));
+
+
+    // LUI places hh at bit 12; shifting left by 32 and then right by 33
+    // leaves hh at bit 11 with the bits above it shifted out.
+    em_wp(_jit, _LUI(r0, hh));
+    em_wp(_jit, _SLLI(r0, r0, 32));
+    em_wp(_jit, _SRLI(r0, r0, 33));
+    em_wp(_jit, _ADDI(r0, r0, hl));
+
+    em_wp(_jit, _SLLI(r0, r0, 11));
+    em_wp(_jit, _ADDI(r0, r0, lh));
+
+    em_wp(_jit, _SLLI(r0, r0, 11));
+    em_wp(_jit, _ADDI(r0, r0, lm));
+
+    em_wp(_jit, _SLLI(r0, r0, 11));
+    em_wp(_jit, _ADDI(r0, r0, ll));
+  }
+}
+
+// An AUIPC + load instruction pair, viewable either as two instr_t words
+// or as one 64-bit value so that the pair can be patched and emitted as a
+// single unit.
+typedef union{
+  struct{
+    instr_t auipc;
+    instr_t load;        // `ld` in RV64 and `lw` in RV32
+  } inst;
+  uint64_t l;            // both instruction words as one 64-bit value
+} load_from_pool_t;
+
+// Return the AUIPC + load pair `instrs` with its immediates rewritten from
+// `off`: the upper part goes into the AUIPC U-immediate and the low 12 bits
+// go into the load's I-immediate.
+// NOTE(review): `off & 0xFFFFF000` is stored into imm31_12 without being
+// shifted right by 12; if that bitfield is 20 bits wide the upper bits are
+// lost. Also, the load's 12-bit immediate is presumably sign-extended by the
+// hardware, which would need a +0x800 adjustment of the upper part. Confirm
+// against the instr_t definition.
+static uint64_t
+patch_load_from_pool(uint64_t instrs, uint32_t off){
+  load_from_pool_t i;
+  i.l = instrs;
+  i.inst.auipc.U.imm31_12 = off & 0xFFFFF000;
+  i.inst.load.I.imm11_0   = off & 0x00000FFF;
+  return i.l;
+}
+
+// Emit the AUIPC + load pair `insts` and return a relocation for it.
+//
+// Each iteration records a JIT_RELOC_LOAD_FROM_POOL relocation at the
+// current pc and registers it with add_pending_literal; the 64-bit pair is
+// only emitted once that call succeeds, otherwise the whole sequence is
+// retried (presumably after the pending literals were flushed -- confirm
+// against add_pending_literal).
+static jit_reloc_t
+emit_load_from_pool(jit_state_t *_jit, uint64_t insts)
+{
+  while (1) {
+    uint8_t *pc_base = _jit->pc.uc;   // Offset is from current PC
+    // Always zero here: pc_base was just read from the same pc.
+    int32_t off = (_jit->pc.uc - pc_base);
+    jit_reloc_t ret =
+      jit_reloc (_jit, JIT_RELOC_LOAD_FROM_POOL, 0, _jit->pc.uc, pc_base, 0);
+    uint8_t load_from_pool_width = 32;
+    if (add_pending_literal(_jit, ret, load_from_pool_width)) {
+      emit_u64(_jit, patch_load_from_pool(insts, off));
+      return ret;
+    }
+  }
+}
+// Build an AUIPC + word-sized load pair targeting r0 (LD on 64-bit builds,
+// LW on 32-bit builds), both with zero immediates, and emit it through
+// emit_load_from_pool. Returns the relocation produced there.
+static jit_reloc_t
+movi_from_pool(jit_state_t *_jit, int32_t r0)
+{
+  load_from_pool_t pair;
+  pair.inst.auipc.w = _AUIPC(r0, 0);
+#if __WORDSIZE == 64
+  pair.inst.load.w  = _LD(r0, r0, 0);
+#elif __WORDSIZE == 32
+  pair.inst.load.w  = _LW(r0, r0, 0);
+#endif
+  const uint64_t both = pair.l;
+  return emit_load_from_pool(_jit, both);
+}
+// Relocatable address load into r0; delegates to movi_from_pool.
+static jit_reloc_t
+mov_addr(jit_state_t *_jit, int32_t r0)
+{
+  const jit_reloc_t reloc = movi_from_pool(_jit, r0);
+  return reloc;
+}
+
+
+// Extend the low 8 bits of r1 into r0 via lshi followed by rshi by
+// (__WORDSIZE - 8).
+static void
+extr_c(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int shift = __WORDSIZE - 8;
+  lshi(_jit, r0, r1, shift);
+  rshi(_jit, r0, r0, shift);
+}
+
+// Extend the low 8 bits of r1 into r0 via lshi followed by rshi_u by
+// (__WORDSIZE - 8).
+static void
+extr_uc(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int shift = __WORDSIZE - 8;
+  lshi(_jit, r0, r1, shift);
+  rshi_u(_jit, r0, r0, shift);
+}
+
+// Extend the low 16 bits of r1 into r0 via lshi followed by rshi by
+// (__WORDSIZE - 16).
+static void
+extr_s(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int shift = __WORDSIZE - 16;
+  lshi(_jit, r0, r1, shift);
+  rshi(_jit, r0, r0, shift);
+}
+
+// Extend the low 16 bits of r1 into r0 via lshi followed by rshi_u by
+// (__WORDSIZE - 16).
+static void
+extr_us(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int shift = __WORDSIZE - 16;
+  lshi(_jit, r0, r1, shift);
+  rshi_u(_jit, r0, r0, shift);
+}
+
+#  if __WORDSIZE == 64
+// Extend the low 32 bits of r1 into r0 via lshi followed by rshi by
+// (__WORDSIZE - 32).
+static void
+extr_i(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int shift = __WORDSIZE - 32;
+  lshi(_jit, r0, r1, shift);
+  rshi(_jit, r0, r0, shift);
+}
+// Extend the low 32 bits of r1 into r0 via lshi followed by rshi_u by
+// (__WORDSIZE - 32).
+static void
+extr_ui(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int shift = __WORDSIZE - 32;
+  lshi(_jit, r0, r1, shift);
+  rshi_u(_jit, r0, r0, shift);
+}
+#endif
+
+/*
+ * Store operations
+ */
+// Emit a single SB with operands (r0, r1) and a zero immediate.
+static void
+str_c(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _SB(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+// Emit a single SB with operands (r0, r1) and a zero immediate; identical
+// to str_c.
+static void
+str_uc(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _SB(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+// Emit a single SH with operands (r0, r1) and a zero immediate.
+static void
+str_s(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _SH(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+// Emit a single SW with operands (r0, r1) and a zero immediate.
+static void
+str_i(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _SW(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+#if __WORDSIZE == 64
+// Emit a single SD with operands (r0, r1) and a zero immediate.
+static void
+str_l(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _SD(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+#endif
+
+// Byte store to an absolute address: i0 is loaded into a temporary, which
+// is then passed with r0 to str_c.
+static void
+sti_c(jit_state_t *_jit, jit_word_t i0, int32_t r0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  str_c(_jit, addr_reg, r0);
+  unget_temp_gpr(_jit);
+}
+
+// Halfword store to an absolute address: i0 is loaded into a temporary,
+// which is then passed with r0 to str_s.
+static void
+sti_s(jit_state_t *_jit, jit_word_t i0, int32_t r0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  str_s(_jit, addr_reg, r0);
+  unget_temp_gpr(_jit);
+}
+
+// Word store to an absolute address: i0 is loaded into a temporary, which
+// is then passed with r0 to str_i.
+static void
+sti_i(jit_state_t *_jit, jit_word_t i0, int32_t r0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  str_i(_jit, addr_reg, r0);
+  unget_temp_gpr(_jit);
+}
+
+#if __WORDSIZE == 64
+// Doubleword store to an absolute address: i0 is loaded into a temporary,
+// which is then passed with r0 to str_l.
+static void
+sti_l(jit_state_t *_jit, jit_word_t i0, int32_t r0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  str_l(_jit, addr_reg, r0);
+  unget_temp_gpr(_jit);
+}
+#endif
+
+// Indexed byte store: r0 + r1 is computed into a temporary, which is then
+// passed with r2 to str_c.
+static void
+stxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  addr(_jit, addr_reg, r0, r1);
+  str_c(_jit, addr_reg, r2);
+  unget_temp_gpr(_jit);
+}
+
+// Indexed halfword store: r0 + r1 is computed into a temporary, which is
+// then passed with r2 to str_s.
+static void
+stxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  addr(_jit, addr_reg, r0, r1);
+  str_s(_jit, addr_reg, r2);
+  unget_temp_gpr(_jit);
+}
+
+// Indexed word store: r0 + r1 is computed into a temporary, which is then
+// passed with r2 to str_i.
+static void
+stxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  addr(_jit, addr_reg, r0, r1);
+  str_i(_jit, addr_reg, r2);
+  unget_temp_gpr(_jit);
+}
+
+# if __WORDSIZE == 64
+// Indexed doubleword store: r0 + r1 is computed into a temporary, which is
+// then passed with r2 to str_l.
+static void
+stxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  addr(_jit, addr_reg, r0, r1);
+  str_l(_jit, addr_reg, r2);
+  unget_temp_gpr(_jit);
+}
+#endif
+
+// Byte store at r0 + i0: a single SB when i0 passes simm12_p, otherwise
+// the address is computed into a temporary and str_c is used.
+static void
+stxi_c(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1)
+{
+  if (!simm12_p(i0)) {
+    const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+    addi(_jit, addr_reg, r0, i0);
+    str_c(_jit, addr_reg, r1);
+    unget_temp_gpr(_jit);
+    return;
+  }
+  em_wp(_jit, _SB(r0, r1, i0));
+}
+
+
+// Halfword store at r0 + i0: a single SH when i0 passes simm12_p,
+// otherwise the address is computed into a temporary and str_s is used.
+static void
+stxi_s(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1)
+{
+  if (!simm12_p(i0)) {
+    const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+    addi(_jit, addr_reg, r0, i0);
+    str_s(_jit, addr_reg, r1);
+    unget_temp_gpr(_jit);
+    return;
+  }
+  em_wp(_jit, _SH(r0, r1, i0));
+}
+
+
+// Word store at r0 + i0: a single SW when i0 passes simm12_p, otherwise
+// the address is computed into a temporary and str_i is used.
+static void
+stxi_i(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1)
+{
+  if (!simm12_p(i0)) {
+    const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+    addi(_jit, addr_reg, r0, i0);
+    str_i(_jit, addr_reg, r1);
+    unget_temp_gpr(_jit);
+    return;
+  }
+  em_wp(_jit, _SW(r0, r1, i0));
+}
+
+# if __WORDSIZE == 64
+// Doubleword store at r0 + i0: a single SD when i0 passes simm12_p,
+// otherwise the address is computed into a temporary and str_l is used.
+static void
+stxi_l(jit_state_t *_jit,jit_word_t i0,int32_t r0,int32_t r1)
+{
+  if (!simm12_p(i0)) {
+    const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+    addi(_jit, addr_reg, r0, i0);
+    str_l(_jit, addr_reg, r1);
+    unget_temp_gpr(_jit);
+    return;
+  }
+  em_wp(_jit, _SD(r0, r1, i0));
+}
+# endif
+
+
+/*
+ * Load operations
+ */
+// Emit a single LB with operands (r0, r1) and a zero immediate.
+static void
+ldr_c(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _LB(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+
+// Emit a single LBU with operands (r0, r1) and a zero immediate.
+static void
+ldr_uc(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _LBU(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+
+// Emit a single LH with operands (r0, r1) and a zero immediate.
+static void
+ldr_s(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _LH(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+
+// Emit a single LHU with operands (r0, r1) and a zero immediate.
+static void
+ldr_us(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _LHU(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+
+// Emit a single LW with operands (r0, r1) and a zero immediate.
+static void
+ldr_i(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _LW(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+
+# if __WORDSIZE == 64
+// Emit a single LWU with operands (r0, r1) and a zero immediate.
+static void
+ldr_ui(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _LWU(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+
+// Emit a single LD with operands (r0, r1) and a zero immediate.
+static void
+ldr_l(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _LD(r0, r1, 0);
+  em_wp(_jit, insn);
+}
+# endif
+
+
+// Load from an absolute address: i0 is loaded into a temporary, then ldr_c
+// reads through it into r0.
+static void
+ldi_c(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  ldr_c(_jit, r0, addr_reg);
+  unget_temp_gpr(_jit);
+}
+
+// Load from an absolute address: i0 is loaded into a temporary, then
+// ldr_uc reads through it into r0.
+static void
+ldi_uc(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  ldr_uc(_jit, r0, addr_reg);
+  unget_temp_gpr(_jit);
+}
+
+// Load from an absolute address: i0 is loaded into a temporary, then ldr_s
+// reads through it into r0.
+static void
+ldi_s(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  ldr_s(_jit, r0, addr_reg);
+  unget_temp_gpr(_jit);
+}
+
+// Load from an absolute address: i0 is loaded into a temporary, then
+// ldr_us reads through it into r0.
+static void
+ldi_us(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  ldr_us(_jit, r0, addr_reg);
+  unget_temp_gpr(_jit);
+}
+
+
+// Load from an absolute address: i0 is loaded into a temporary, then ldr_i
+// reads through it into r0.
+static void
+ldi_i(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  ldr_i(_jit, r0, addr_reg);
+  unget_temp_gpr(_jit);
+}
+
+#  if __WORDSIZE == 64
+// Load from an absolute address: i0 is loaded into a temporary, then
+// ldr_ui reads through it into r0.
+static void
+ldi_ui(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  ldr_ui(_jit, r0, addr_reg);
+  unget_temp_gpr(_jit);
+}
+
+// Load from an absolute address: i0 is loaded into a temporary, then ldr_l
+// reads through it into r0.
+static void
+ldi_l(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, addr_reg, i0);
+  ldr_l(_jit, r0, addr_reg);
+  unget_temp_gpr(_jit);
+}
+#endif
+
+
+
+
+// Indexed load: r1 + r2 is computed into a temporary, then ldr_c reads
+// through it into r0.
+static void
+ldxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  addr(_jit, addr_reg, r1, r2);
+  ldr_c(_jit, r0, addr_reg);
+  unget_temp_gpr(_jit);
+}
+// Indexed load: r1 + r2 is computed into a temporary, then ldr_uc reads
+// through it into r0.
+static void
+ldxr_uc(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t addr_reg = jit_gpr_regno(get_temp_gpr(_jit));
+  addr(_jit, addr_reg, r1, r2);
+  ldr_uc(_jit, r0, addr_reg);
+  unget_temp_gpr(_jit);
+}
+/* Load through ldr_s from the address r1 + r2, which is summed into a
+   scratch register first. */
+static void
+ldxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+
+  addr(_jit, sum, r1, r2);
+  ldr_s(_jit, r0, sum);
+  unget_temp_gpr(_jit);
+}
+/* Load through ldr_us from the address r1 + r2, which is summed into a
+   scratch register first. */
+static void
+ldxr_us(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+
+  addr(_jit, sum, r1, r2);
+  ldr_us(_jit, r0, sum);
+  unget_temp_gpr(_jit);
+}
+/* Load through ldr_i from the address r1 + r2, which is summed into a
+   scratch register first. */
+static void
+ldxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+
+  addr(_jit, sum, r1, r2);
+  ldr_i(_jit, r0, sum);
+  unget_temp_gpr(_jit);
+}
+#  if __WORDSIZE == 64
+/* Load through ldr_ui from the address r1 + r2, which is summed into a
+   scratch register first. */
+static void
+ldxr_ui(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+
+  addr(_jit, sum, r1, r2);
+  ldr_ui(_jit, r0, sum);
+  unget_temp_gpr(_jit);
+}
+/* Load through ldr_l from the address r1 + r2, which is summed into a
+   scratch register first. */
+static void
+ldxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+
+  addr(_jit, sum, r1, r2);
+  ldr_l(_jit, r0, sum);
+  unget_temp_gpr(_jit);
+}
+#endif
+
+
+
+
+/*
+ * Signed-char load from r1 + i0 into r0.
+ *
+ * When i0 passes simm12_p the offset is encoded directly in one LB;
+ * otherwise the address is computed into a scratch register and the load
+ * goes through ldr_c.  Both paths must load the same width, so the short
+ * form uses LB and not the 64-bit LD.
+ */
+static void
+ldxi_c(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LB(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_c(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+/*
+ * Unsigned-char load from r1 + i0 into r0.
+ *
+ * When i0 passes simm12_p the offset is encoded directly in one LBU;
+ * otherwise the address is computed into a scratch register and the load
+ * goes through ldr_uc.  Both paths must load the same width, so the short
+ * form uses LBU and not the 64-bit LD.
+ */
+static void
+ldxi_uc(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LBU(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_uc(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+/*
+ * Unsigned-short load from r1 + i0 into r0.
+ *
+ * When i0 passes simm12_p the offset is encoded directly in one LHU;
+ * otherwise the address is computed into a scratch register and the load
+ * goes through ldr_us.  Both paths must load the same width, so the short
+ * form uses LHU and not the 64-bit LD.
+ */
+static void
+ldxi_us(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LHU(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_us(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+/*
+ * Signed-short load from r1 + i0 into r0.
+ *
+ * When i0 passes simm12_p the offset is encoded directly in one LH;
+ * otherwise the address is computed into a scratch register and the load
+ * goes through ldr_s.  Both paths must load the same width, so the short
+ * form uses LH and not the 64-bit LD.
+ */
+static void
+ldxi_s(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LH(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_s(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+/*
+ * Signed 32-bit load from r1 + i0 into r0.
+ *
+ * When i0 passes simm12_p the offset is encoded directly in one LW;
+ * otherwise the address is computed into a scratch register and the load
+ * goes through ldr_i.  Both paths must load the same width, so the short
+ * form uses LW and not LD (which is also unavailable on RV32).
+ */
+static void
+ldxi_i(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LW(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_i(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+#  if __WORDSIZE == 64
+/*
+ * Unsigned 32-bit load from r1 + i0 into r0 (64-bit targets only).
+ *
+ * When i0 passes simm12_p the offset is encoded directly in one LWU;
+ * otherwise the address is computed into a scratch register and the load
+ * goes through ldr_ui.  Both paths must load the same width, so the short
+ * form uses LWU and not the 64-bit LD.
+ */
+static void
+ldxi_ui(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LWU(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_ui(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+/* 64-bit load from r1 + i0 into r0: a single LD when i0 passes simm12_p,
+   otherwise the address is built in a scratch register for ldr_l. */
+static void
+ldxi_l(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (!simm12_p(i0)) {
+    const int32_t base = jit_gpr_regno(get_temp_gpr(_jit));
+
+    addi(_jit, base, r1, i0);
+    ldr_l(_jit, r0, base);
+    unget_temp_gpr(_jit);
+    return;
+  }
+
+  em_wp(_jit, _LD(r0, r1, i0));
+}
+#endif
+
+
+/*
+ * Argument management
+ */
+
+// static void
+// pushr(jit_state_t *_jit, int32_t r0)
+// {
+// #if __WORDSIZE == 64
+//   addi(jit_gpr_regno(_SP), -8);
+//   em_wp(_SD(r0, jit_gpr_regno(_SP), 0));
+// #elif __WORDSIZE == 32
+//   addi(jit_gpr_regno(_SP), -4);
+//   em_wp(_SW(r0, jit_gpr_regno(_SP), 0));
+// #endif
+// }
+// static void
+// popr(jit_state_t *_jit, int32_t r0)
+// {
+// #if __WORDSIZE == 64
+//   em_wp(_jit, _LD(r0, jit_gpr_regno(_SP), 0));
+//   addi(jit_gpr_regno(_SP), 8);
+// #elif __WORDSIZE == 32
+//   em_wp(_jit, _LW(r0, jit_gpr_regno(_SP), 0));
+//   addi(jit_gpr_regno(_SP), 4);
+// #endif
+// }
+
+/* Emit the RET pseudo-instruction. */
+static void
+ret(jit_state_t *_jit)
+{
+  const uint32_t insn = _RET();
+
+  em_wp(_jit, insn);
+}
+
+/* Return the value held in r0: copy it into A0, then emit the return. */
+static void
+retr(jit_state_t *_jit, int32_t r0)
+{
+  const int32_t a0 = jit_gpr_regno(_A0);
+
+  movr(_jit, a0, r0);
+  ret(_jit);
+}
+
+/* Return the constant i0: load it into A0, then emit the return. */
+static void
+reti(jit_state_t *_jit, jit_word_t i0)
+{
+  const int32_t a0 = jit_gpr_regno(_A0);
+
+  movi(_jit, a0, i0);
+  ret(_jit);
+}
+
+/* Fetch a call's result from A0 into r0, narrowing it with extr_c. */
+static void
+retval_c(jit_state_t *_jit, int32_t r0)
+{
+  const int32_t a0 = jit_gpr_regno(_A0);
+
+  extr_c(_jit, r0, a0);
+}
+
+/* Fetch a call's result from A0 into r0, narrowing it with extr_uc. */
+static void
+retval_uc(jit_state_t *_jit, int32_t r0)
+{
+  const int32_t a0 = jit_gpr_regno(_A0);
+
+  extr_uc(_jit, r0, a0);
+}
+
+/* Fetch a call's result from A0 into r0, narrowing it with extr_s. */
+static void
+retval_s(jit_state_t *_jit, int32_t r0)
+{
+  const int32_t a0 = jit_gpr_regno(_A0);
+
+  extr_s(_jit, r0, a0);
+}
+
+/* Fetch a call's result from A0 into r0, narrowing it with extr_us. */
+static void
+retval_us(jit_state_t *_jit, int32_t r0)
+{
+  const int32_t a0 = jit_gpr_regno(_A0);
+
+  extr_us(_jit, r0, a0);
+}
+
+/* Fetch a call's result from A0 into r0, narrowing it with extr_i. */
+static void
+retval_i(jit_state_t *_jit, int32_t r0)
+{
+  const int32_t a0 = jit_gpr_regno(_A0);
+
+  extr_i(_jit, r0, a0);
+}
+
+#  if __WORDSIZE == 64
+/* Fetch a call's result from A0 into r0, narrowing it with extr_ui. */
+static void
+retval_ui(jit_state_t *_jit, int32_t r0)
+{
+  const int32_t a0 = jit_gpr_regno(_A0);
+
+  extr_ui(_jit, r0, a0);
+}
+
+/* Fetch a call's full-width result: plain register move from A0 to r0. */
+static void
+retval_l(jit_state_t *_jit, int32_t r0)
+{
+  const int32_t a0 = jit_gpr_regno(_A0);
+
+  movr(_jit, r0, a0);
+}
+#endif
+
+/*
+ * Jump and return instructions
+ */
+/* Return INST with its J-type immediate fields replaced by bits 1..20 of
+   OFFSET; every other bit of INST is kept. */
+static uint32_t
+patch_jump(uint32_t inst, int32_t offset)
+{
+  instr_t word;
+
+  word.w = inst;
+
+  /* The immediate is scattered over four fields of the encoding. */
+  word.J.imm10_1  = (offset >>  1) & 0x3ff;
+  word.J.imm11    = (offset >> 11) &   0x1;
+  word.J.imm19_12 = (offset >> 12) &  0xff;
+  word.J.imm20    = (offset >> 20) &   0x1;
+
+  return word.w;
+}
+/*
+ * Emit the jump instruction INST and return a relocation for it so the
+ * target can be patched later.
+ *
+ * The relocation is created first and registered with add_pending_literal;
+ * the instruction is only emitted once that call succeeds, otherwise the
+ * whole sequence is retried with a freshly sampled pc.
+ * NOTE(review): presumably a failed add_pending_literal flushes the pending
+ * literal pool and thereby moves pc -- confirm against its definition.
+ */
+static jit_reloc_t
+emit_jump(jit_state_t *_jit, uint32_t inst)
+{
+  while (1) {
+    uint8_t *pc_base = _jit->pc.uc;   // Offset is from current PC
+    // Initial displacement patched into the instruction below.
+    int32_t off = (uint8_t*)jit_address(_jit) - pc_base;
+    jit_reloc_t ret =
+      jit_reloc (_jit, JIT_RELOC_JMP_WITH_VENEER, 0, _jit->pc.uc, pc_base, 0);
+    // Width of the jump immediate; one bit less is passed on below.
+    uint8_t jump_width = 20;
+    if (add_pending_literal(_jit, ret, jump_width - 1)) {
+      em_wp(_jit, patch_jump(inst, off));
+      return ret;
+    }
+  }
+}
+
+/* Call the address held in r0: JALR with RA as the link register. */
+static void
+callr(jit_state_t *_jit, int32_t r0)
+{
+  const uint32_t insn = _JALR(jit_gpr_regno(_RA), r0, 0);
+
+  em_wp(_jit, insn);
+}
+
+/* Call the absolute address i0, linking through RA.  A pc-relative JAL is
+   used when the displacement passes simm20_p; otherwise the address is
+   loaded into a scratch register and called with callr. */
+static void
+calli(jit_state_t *_jit, jit_word_t i0)
+{
+  const jit_word_t delta = i0 - (jit_word_t)(_jit->pc.uc);
+
+  if (simm20_p(delta)) {
+    em_wp(_jit, _JAL(jit_gpr_regno(_RA), delta));
+    return;
+  }
+
+  const int32_t target = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, target, i0);
+  callr(_jit, target);
+  unget_temp_gpr(_jit);
+}
+
+/* Jump to i0 while recording a return address; on this target that is
+   exactly what calli emits. */
+static void
+jmpi_with_link(jit_state_t *_jit, jit_word_t i0)
+{
+  const jit_word_t target = i0;
+
+  calli(_jit, target);
+}
+
+/* Intentionally emits nothing on this target. */
+static void
+pop_link_register(jit_state_t *_jit)
+{
+  (void)_jit;
+}
+
+/* Intentionally emits nothing on this target. */
+static void
+push_link_register(jit_state_t *_jit)
+{
+  (void)_jit;
+}
+
+/* Jump to the address held in r0: JALR whose link result goes to the zero
+   register, so no return address is kept. */
+static void
+jmpr(jit_state_t *_jit, int32_t r0)
+{
+  const uint32_t insn = _JALR(jit_gpr_regno(_ZERO), r0, 0);
+
+  em_wp(_jit, insn);
+}
+
+/* Jump to the absolute address i0 without linking.  A pc-relative JAL is
+   used when the displacement passes simm20_p; otherwise the address is
+   loaded into a scratch register and reached with jmpr. */
+static void
+jmpi(jit_state_t *_jit, jit_word_t i0)
+{
+  const jit_word_t delta = i0 - (jit_word_t)(_jit->pc.uc);
+
+  if (simm20_p(delta)) {
+    em_wp(_jit, _JAL(jit_gpr_regno(_ZERO), delta));
+    return;
+  }
+
+  const int32_t target = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, target, i0);
+  jmpr(_jit, target);
+  unget_temp_gpr(_jit);
+}
+
+/* Emit an unconditional, non-linking JAL with a zero displacement and
+   return the relocation through which its target is patched later. */
+static jit_reloc_t
+jmp(jit_state_t *_jit)
+{
+  const uint32_t insn = _JAL(jit_gpr_regno(_ZERO), 0);
+
+  return emit_jump(_jit, insn);
+}
+
+
+
+/*
+ * Atomic operations
+ */
+
+static void
+ldr_atomic(jit_state_t *_jit, int32_t dst, int32_t loc)
+{
+  em_wp(_jit, _FENCE(0xFF));
+  ldr_i(_jit, dst, loc);
+  em_wp(_jit, _FENCE(0xFF));
+}
+
+static void
+str_atomic(jit_state_t *_jit, int32_t loc, int32_t val)
+{
+  em_wp(_jit, _FENCE(0xFF));
+  str_i(_jit, loc, val);
+  em_wp(_jit, _FENCE(0xFF));
+}
+
+/* Atomic exchange with one AMOSWAP (.W on 32-bit, .D on 64-bit targets),
+   passing 1 for both trailing flag arguments.
+   NOTE(review): those two flags are presumably the aq/rl ordering bits --
+   confirm against the _AMOSWAP_* definitions. */
+static void
+swap_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, int32_t val)
+{
+#if __WORDSIZE == 32
+  em_wp(_jit, _AMOSWAP_W(dst, loc, val, 1, 1));
+#elif __WORDSIZE == 64
+  em_wp(_jit, _AMOSWAP_D(dst, loc, val, 1, 1));
+#endif
+}
+
+/*
+ * Compare-and-swap on the word at address LOC, built from a load-reserved /
+ * store-conditional retry loop.
+ *
+ * The value observed at LOC always ends up in DST.  DESIRED is stored only
+ * when that value equals EXPECTED.
+ * NOTE(review): the trailing 0,0 arguments of the LR/SC macros are
+ * presumably the aq/rl ordering bits -- confirm against their definitions.
+ */
+static void
+cas_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, int32_t expected,
+           int32_t desired)
+{
+  // t0 receives the loaded value, t1 the store-conditional status.
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit));
+
+  // A failed store-conditional branches back here.
+  void *retry = jit_address(_jit);
+
+#if __WORDSIZE == 64
+  em_wp(_jit, _LR_D(t0, loc, 0,0));
+#elif __WORDSIZE == 32
+  em_wp(_jit, _LR_W(t0, loc, 0,0));
+#endif
+
+  // Observed value differs from EXPECTED: skip the store.
+  jit_reloc_t fail = bner(_jit, t0, expected);
+
+#if __WORDSIZE == 64
+  em_wp(_jit, _SC_D(t1, desired, loc, 0,0));
+#elif __WORDSIZE == 32
+  em_wp(_jit, _SC_W(t1, desired, loc, 0,0));
+#endif
+
+  // Non-zero status in t1: start over from the load.
+  jit_patch_there(_jit, bner(_jit, t1, jit_gpr_regno(_ZERO)), retry);
+
+  // Both the mismatch path and the successful store continue here.
+  jit_patch_here(_jit, fail);
+  em_wp(_jit, _FENCE(0xFF));
+  movr(_jit, dst, t0);
+
+  // Release both temporaries acquired above.
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+}
+
+
+/*
+ * Byte swapping operations
+ * RISC-V doesn't provide them in the base instruction set.
+ * There's a draft B extension (Standard Extension for Bit Manipulation), but
+ * it's not official yet:
+ *     https://github.com/riscv/riscv-bitmanip
+ * In the meantime, we need to implement them in software.
+ */
+/*
+ * Reverse the order of the low SIZE bytes of r1 and put the result in r0;
+ * every bit of r0 above those SIZE bytes ends up clear.
+ *
+ * The result is accumulated in r0 one byte at a time, starting with the
+ * lowest source byte, which ends up in the highest position.
+ *
+ * r0 and r1 may name the same register: r0 is overwritten while the source
+ * is still being read, so in that case the source is first copied to a
+ * second scratch register.
+ */
+static void
+bswapr_uany(jit_state_t *_jit, int32_t r0, int32_t r1, size_t size)
+{
+  const int aliased = (r0 == r1);
+  const int32_t byte = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t src = r1;
+
+  if (aliased) {
+    src = jit_gpr_regno(get_temp_gpr(_jit));
+    movr(_jit, src, r1);
+  }
+
+  andi(_jit, r0, src, 0xFF);
+  for (size_t i = 1; i < size; i++) {
+    lshi(_jit, r0, r0, 8);
+    rshi(_jit, byte, src, 8 * i);
+    andi(_jit, byte, byte, 0xFF);
+    orr(_jit, r0, r0, byte);
+  }
+
+  /* One release per temporary acquired above. */
+  if (aliased)
+    unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+}
+
+/* Byte-swap the low 2 bytes of r1 into r0. */
+static void
+bswapr_us(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const size_t width = 2;
+
+  bswapr_uany(_jit, r0, r1, width);
+}
+
+/* Byte-swap the low 4 bytes of r1 into r0. */
+static void
+bswapr_ui(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const size_t width = 4;
+
+  bswapr_uany(_jit, r0, r1, width);
+}
+
+#  if __WORDSIZE == 64
+/* Byte-swap all 8 bytes of r1 into r0. */
+static void
+bswapr_ul(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const size_t width = 8;
+
+  bswapr_uany(_jit, r0, r1, width);
+}
+#endif
+
+
+
+/*
+ * Others
+ * TODO
+ */
+/* Emit IM bytes of padding as 4-byte NOPs.  IM must be a multiple of 4;
+   this is asserted after the NOPs have been emitted. */
+static void
+nop(jit_state_t *_jit, int32_t im)
+{
+  while (im > 0) {
+    em_wp(_jit, _NOP());
+    im -= 4;
+  }
+  assert(im == 0);
+}
+/* Placeholder: currently emits nothing. */
+static void
+mfence(jit_state_t *_jit)
+{
+  // TODO: we may need it for atomic operations?
+  (void)_jit;
+}
+
+/* Emit an EBREAK instruction. */
+static void
+breakpoint(jit_state_t *_jit)
+{
+  const uint32_t insn = _EBREAK();
+
+  em_wp(_jit, insn);
+}
diff --git a/lightening/riscv-fpu.c b/lightening/riscv-fpu.c
new file mode 100644
index 000000000..315ed8d14
--- /dev/null
+++ b/lightening/riscv-fpu.c
@@ -0,0 +1,858 @@
+/*
+ * RV32F Standard Extension
+ *
+ * Single-precision encodings.  Every macro in this table only builds an
+ * instruction through Itype/Stype/Rtype/R4type; callers pass the result to
+ * em_wp.  `rd' is the destination, `rs1'..`rs3' the sources and `rm' the
+ * rounding-mode field of the instruction.
+ */
+#define _FLW(rd, rs1, im)              Itype(7, rd, 2, rs1, im)
+#define _FSW(rs1, rs2, imm)            Stype(39, 2, rs1, rs2, imm)
+#define _FMADD_S(rd, rs1, rs2, rs3)    R4type(67, rd, 0, rs1, rs2, 0, rs3)
+#define _FMSUB_S(rd, rs1, rs2, rs3)    R4type(71, rd, 0, rs1, rs2, 0, rs3)
+#define _FNMSUB_S(rd, rs1, rs2, rs3)   R4type(75, rd, 0, rs1, rs2, 0, rs3)
+#define _FNMADD_S(rd, rs1, rs2, rs3)   R4type(79, rd, 0, rs1, rs2, 0, rs3)
+#define _FADD_S(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 0)
+#define _FSUB_S(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 4)
+#define _FMUL_S(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 8)
+#define _FDIV_S(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 12)
+#define _FSQRT_S(rd, rs1)              Rtype(83, rd, 0, rs1, 0, 44)
+#define _FSGNJ_S(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 16)
+#define _FSGNJN_S(rd, rs1, rs2)        Rtype(83, rd, 1, rs1, rs2, 16)
+#define _FSGNJX_S(rd, rs1, rs2)        Rtype(83, rd, 2, rs1, rs2, 16)
+#define _FMIN_S(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 20)
+#define _FMAX_S(rd, rs1, rs2)          Rtype(83, rd, 1, rs1, rs2, 20)
+#define _FCVT_W_S(rd, rs1, rm)         Rtype(83, rd, rm, rs1, 0, 96)
+#define _FCVT_WU_S(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 1, 96)
+#define _FMV_X_W(rd, rs1)              Rtype(83, rd, 0, rs1, 0, 112)
+#define _FEQ_S(rd, rs1, rs2)           Rtype(83, rd, 2, rs1, rs2, 80)
+#define _FLT_S(rd, rs1, rs2)           Rtype(83, rd, 1, rs1, rs2, 80)
+#define _FLE_S(rd, rs1, rs2)           Rtype(83, rd, 0, rs1, rs2, 80)
+#define _FCLASS_S(rd, rs1)             Rtype(83, rd, 1, rs1, 0, 112)
+#define _FCVT_S_W(rd, rs1, rm)         Rtype(83, rd, rm, rs1, 0, 104)
+#define _FCVT_S_WU(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 1, 104)
+#define _FMV_W_X(rd, rs1)              Rtype(83, rd, 0, rs1, 0, 120)
+/*
+ * RV64F Standard Extension (in addition to RV32F)
+ *
+ * Conversions between single precision and 64-bit integers.
+ */
+#define _FCVT_L_S(rd, rs1, rm)         Rtype(83, rd, rm, rs1, 2, 96)
+#define _FCVT_LU_S(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 3, 96)
+#define _FCVT_S_L(rd, rs1, rm)         Rtype(83, rd, rm, rs1, 2, 104)
+#define _FCVT_S_LU(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 3, 104)
+/*
+ * RV32D Standard Extension
+ *
+ * Double-precision counterparts of the RV32F table above.
+ */
+#define _FLD(rd, rs1, im)              Itype(7, rd, 3, rs1, im)
+#define _FSD(rs1, rs2, imm)            Stype(39, 3, rs1, rs2, imm)
+#define _FMADD_D(rd, rs1, rs2, rs3)    R4type(67, rd, 0, rs1, rs2, 1, rs3)
+#define _FMSUB_D(rd, rs1, rs2, rs3)    R4type(71, rd, 0, rs1, rs2, 1, rs3)
+#define _FNMSUB_D(rd, rs1, rs2, rs3)   R4type(75, rd, 0, rs1, rs2, 1, rs3)
+#define _FNMADD_D(rd, rs1, rs2, rs3)   R4type(79, rd, 0, rs1, rs2, 1, rs3)
+#define _FADD_D(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 1)
+#define _FSUB_D(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 5)
+#define _FMUL_D(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 9)
+#define _FDIV_D(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 13)
+#define _FSQRT_D(rd, rs1)              Rtype(83, rd, 0, rs1, 0, 45)
+#define _FSGNJ_D(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 17)
+#define _FSGNJN_D(rd, rs1, rs2)        Rtype(83, rd, 1, rs1, rs2, 17)
+#define _FSGNJX_D(rd, rs1, rs2)        Rtype(83, rd, 2, rs1, rs2, 17)
+#define _FMIN_D(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 21)
+#define _FMAX_D(rd, rs1, rs2)          Rtype(83, rd, 1, rs1, rs2, 21)
+#define _FCVT_S_D(rd, rs1, rm)         Rtype(83, rd, rm, rs1, 1, 32)
+#define _FCVT_D_S(rd, rs1, rm)         Rtype(83, rd, rm, rs1, 0, 33)
+#define _FEQ_D(rd, rs1, rs2)           Rtype(83, rd, 2, rs1, rs2, 81)
+#define _FLT_D(rd, rs1, rs2)           Rtype(83, rd, 1, rs1, rs2, 81)
+#define _FLE_D(rd, rs1, rs2)           Rtype(83, rd, 0, rs1, rs2, 81)
+#define _FCLASS_D(rd, rs1)             Rtype(83, rd, 1, rs1, 0, 113)
+#define _FCVT_W_D(rd, rs1, rm)         Rtype(83, rd, rm, rs1, 0, 97)
+#define _FCVT_WU_D(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 1, 97)
+#define _FCVT_D_W(rd, rs1, rm)         Rtype(83, rd, rm, rs1, 0, 105)
+#define _FCVT_D_WU(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 1, 105)
+/*
+ * RV64D Standard Extension (in addition to RV32D)
+ *
+ * Conversions and raw moves between double precision and 64-bit integer
+ * registers.
+ */
+#define _FCVT_L_D(rd, rs1, rm)         Rtype(83, rd, rm, rs1, 2, 97)
+#define _FCVT_LU_D(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 3, 97)
+#define _FMV_X_D(rd, rs1)              Rtype(83, rd, 0, rs1, 0, 113)
+#define _FCVT_D_L(rd, rs1, rm)         Rtype(83, rd, rm, rs1, 2, 105)
+#define _FCVT_D_LU(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 3, 105)
+#define _FMV_D_X(rd, rs1)              Rtype(83, rd, 0, rs1, 0, 121)
+/*
+ * Pseudo instructions
+ *
+ * Move, absolute value and negation are all sign-injection instructions
+ * that use the same register for both source operands.
+ */
+#define _FMV_S(r0, r1)                 _FSGNJ_S(r0, r1, r1)
+#define _FABS_S(r0, r1)                _FSGNJX_S(r0, r1, r1)
+#define _FNEG_S(r0, r1)                _FSGNJN_S(r0, r1, r1)
+#define _FMV_D(r0, r1)                 _FSGNJ_D(r0, r1, r1)
+#define _FABS_D(r0, r1)                _FSGNJX_D(r0, r1, r1)
+#define _FNEG_D(r0, r1)                _FSGNJN_D(r0, r1, r1)
+
+// Binary ALU operations
+static void addr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void addr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void subr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void subr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void mulr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void mulr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void divr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void divr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+
+// Unary ALU operations
+static void sqrtr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void sqrtr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void negr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void negr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void absr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void absr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+
+// Transfer operations
+static void movr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void movr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+
+// Argument management
+static void retr_f(jit_state_t *_jit, int32_t u);
+static void retr_d(jit_state_t *_jit, int32_t u);
+
+// Load operations
+static void ldr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void ldr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void ldi_f(jit_state_t *_jit, int32_t r0, jit_word_t i0);
+static void ldxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void ldxi_f(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void ldi_d(jit_state_t *_jit, int32_t r0, jit_word_t i0);
+static void ldxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void ldxi_d(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+
+// Store operations
+static void str_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void str_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void sti_f(jit_state_t *_jit, jit_word_t i0, int32_t r0);
+static void stxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void stxi_f(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1);
+static void sti_d(jit_state_t *_jit, jit_word_t i0, int32_t r0);
+static void stxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void stxi_d(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1);
+
+// Branch instructions
+// Each returns the relocation used to patch the branch target later.
+static jit_reloc_t bltr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bler_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t beqr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bger_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bner_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bunltr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bunler_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t buneqr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bunger_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bungtr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bltgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bordr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bunordr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bltr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bler_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t beqr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bger_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bner_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bunltr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bunler_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t buneqr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bunger_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bungtr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bltgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bordr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bunordr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+
+/*
+ * Binary ALU operations
+ */
+/* r0 = r1 + r2, single precision (one FADD.S). */
+static void
+addr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _FADD_S(r0, r1, r2);
+
+  em_wp(_jit, insn);
+}
+/* r0 = r1 + r2, double precision (one FADD.D). */
+static void
+addr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _FADD_D(r0, r1, r2);
+
+  em_wp(_jit, insn);
+}
+/* r0 = r1 - r2, single precision (one FSUB.S). */
+static void
+subr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _FSUB_S(r0, r1, r2);
+
+  em_wp(_jit, insn);
+}
+/* r0 = r1 - r2, double precision (one FSUB.D). */
+static void
+subr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _FSUB_D(r0, r1, r2);
+
+  em_wp(_jit, insn);
+}
+/* r0 = r1 * r2, single precision (one FMUL.S). */
+static void
+mulr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _FMUL_S(r0, r1, r2);
+
+  em_wp(_jit, insn);
+}
+/* r0 = r1 * r2, double precision (one FMUL.D). */
+static void
+mulr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _FMUL_D(r0, r1, r2);
+
+  em_wp(_jit, insn);
+}
+/* r0 = r1 / r2, single precision (one FDIV.S). */
+static void
+divr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _FDIV_S(r0, r1, r2);
+
+  em_wp(_jit, insn);
+}
+/* r0 = r1 / r2, double precision (one FDIV.D). */
+static void
+divr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const uint32_t insn = _FDIV_D(r0, r1, r2);
+
+  em_wp(_jit, insn);
+}
+
+/*
+ * Unary ALU operations
+ */
+/* r0 = square root of r1, single precision (one FSQRT.S). */
+static void
+sqrtr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FSQRT_S(r0, r1);
+
+  em_wp(_jit, insn);
+}
+/* r0 = square root of r1, double precision (one FSQRT.D). */
+static void
+sqrtr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FSQRT_D(r0, r1);
+
+  em_wp(_jit, insn);
+}
+/* r0 = -r1, single precision, through the FNEG.S pseudo-instruction. */
+static void
+negr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FNEG_S(r0, r1);
+
+  em_wp(_jit, insn);
+}
+/* r0 = -r1, double precision, through the FNEG.D pseudo-instruction. */
+static void
+negr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FNEG_D(r0, r1);
+
+  em_wp(_jit, insn);
+}
+/* r0 = |r1|, single precision, through the FABS.S pseudo-instruction. */
+static void
+absr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FABS_S(r0, r1);
+
+  em_wp(_jit, insn);
+}
+
+/* r0 = |r1|, double precision, through the FABS.D pseudo-instruction. */
+static void
+absr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FABS_D(r0, r1);
+
+  em_wp(_jit, insn);
+}
+
+
+/*
+ * Load operations
+ */
+/* Load a single-precision value from the address in r1 into FPR r0
+   (FLW with offset 0). */
+static void
+ldr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FLW(r0, r1, 0);
+
+  em_wp(_jit, insn);
+}
+/* Load a double-precision value from the address in r1 into FPR r0
+   (FLD with offset 0). */
+static void
+ldr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FLD(r0, r1, 0);
+
+  em_wp(_jit, insn);
+}
+/* Load through ldr_f from the absolute address i0; the address is first
+   materialized in a scratch GPR with movi. */
+static void
+ldi_f(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t base = jit_gpr_regno(get_temp_gpr(_jit));
+
+  movi(_jit, base, i0);
+  ldr_f(_jit, r0, base);
+  unget_temp_gpr(_jit);
+}
+/* Load through ldr_f from the address r1 + r2, which is summed into a
+   scratch GPR first. */
+static void
+ldxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+
+  addr(_jit, sum, r1, r2);
+  ldr_f(_jit, r0, sum);
+  unget_temp_gpr(_jit);
+}
+/* Single-precision load from r1 + i0 into FPR r0: one FLW when i0 passes
+   simm12_p, otherwise the address is built in a scratch GPR for ldr_f. */
+static void
+ldxi_f(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (!simm12_p(i0)) {
+    const int32_t base = jit_gpr_regno(get_temp_gpr(_jit));
+
+    addi(_jit, base, r1, i0);
+    ldr_f(_jit, r0, base);
+    unget_temp_gpr(_jit);
+    return;
+  }
+
+  em_wp(_jit, _FLW(r0, r1, i0));
+}
+/* Load through ldr_d from the absolute address i0; the address is first
+   materialized in a scratch GPR with movi. */
+static void
+ldi_d(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  const int32_t base = jit_gpr_regno(get_temp_gpr(_jit));
+
+  movi(_jit, base, i0);
+  ldr_d(_jit, r0, base);
+  unget_temp_gpr(_jit);
+}
+
+/* Load through ldr_d from the address r1 + r2, which is summed into a
+   scratch GPR first. */
+static void
+ldxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+
+  addr(_jit, sum, r1, r2);
+  ldr_d(_jit, r0, sum);
+  unget_temp_gpr(_jit);
+}
+
+/* Double-precision load from r1 + i0 into FPR r0: one FLD when i0 passes
+   simm12_p, otherwise the address is built in a scratch GPR for ldr_d. */
+static void
+ldxi_d(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (!simm12_p(i0)) {
+    const int32_t base = jit_gpr_regno(get_temp_gpr(_jit));
+
+    addi(_jit, base, r1, i0);
+    ldr_d(_jit, r0, base);
+    unget_temp_gpr(_jit);
+    return;
+  }
+
+  em_wp(_jit, _FLD(r0, r1, i0));
+}
+
+
+
+/*
+ * Store operations
+ */
+/* Single-precision store: emits FSW with r0 and r1 as its two register
+   operands and a zero offset. */
+static void
+str_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FSW(r0, r1, 0);
+
+  em_wp(_jit, insn);
+}
+/* Double-precision store: emits FSD with r0 and r1 as its two register
+   operands and a zero offset. */
+static void
+str_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FSD(r0, r1, 0);
+
+  em_wp(_jit, insn);
+}
+/* Store FPR r0 through str_f at the absolute address i0; the address is
+   first materialized in a scratch GPR with movi. */
+static void
+sti_f(jit_state_t *_jit, jit_word_t i0, int32_t r0)
+{
+  const int32_t base = jit_gpr_regno(get_temp_gpr(_jit));
+
+  movi(_jit, base, i0);
+  str_f(_jit, base, r0);
+  unget_temp_gpr(_jit);
+}
+/* Store FPR r2 through str_f at the address r0 + r1, which is summed into
+   a scratch GPR first. */
+static void
+stxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+
+  addr(_jit, sum, r0, r1);
+  str_f(_jit, sum, r2);
+  unget_temp_gpr(_jit);
+}
+/* Single-precision store of r1 at r0 + i0: one FSW when i0 passes
+   simm12_p, otherwise the address is built in a scratch GPR for str_f. */
+static void
+stxi_f(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1)
+{
+  if (!simm12_p(i0)) {
+    const int32_t base = jit_gpr_regno(get_temp_gpr(_jit));
+
+    addi(_jit, base, r0, i0);
+    str_f(_jit, base, r1);
+    unget_temp_gpr(_jit);
+    return;
+  }
+
+  em_wp(_jit, _FSW(r0, r1, i0));
+}
+/* Store FPR r0 through str_d at the absolute address i0; the address is
+   first materialized in a scratch GPR with movi. */
+static void
+sti_d(jit_state_t *_jit, jit_word_t i0, int32_t r0)
+{
+  const int32_t base = jit_gpr_regno(get_temp_gpr(_jit));
+
+  movi(_jit, base, i0);
+  str_d(_jit, base, r0);
+  unget_temp_gpr(_jit);
+}
+/* Store FPR r2 through str_d at the address r0 + r1, which is summed into
+   a scratch GPR first. */
+static void
+stxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  const int32_t sum = jit_gpr_regno(get_temp_gpr(_jit));
+
+  addr(_jit, sum, r0, r1);
+  str_d(_jit, sum, r2);
+  unget_temp_gpr(_jit);
+}
+/* Double-precision store of r1 at r0 + i0: one FSD when i0 passes
+   simm12_p, otherwise the address is built in a scratch GPR for str_d. */
+static void
+stxi_d(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1)
+{
+  if (!simm12_p(i0)) {
+    const int32_t base = jit_gpr_regno(get_temp_gpr(_jit));
+
+    addi(_jit, base, r0, i0);
+    str_d(_jit, base, r1);
+    unget_temp_gpr(_jit);
+    return;
+  }
+
+  em_wp(_jit, _FSD(r0, r1, i0));
+}
+
+
+/*
+ * Transfer operations
+ */
+/* Copy single-precision FPR r1 into r0; nothing is emitted when both name
+   the same register. */
+static void
+movr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  if (r0 == r1)
+    return;
+
+  em_wp(_jit, _FMV_S(r0, r1));
+}
+
+/* Copy double-precision FPR r1 into r0; nothing is emitted when both name
+   the same register. */
+static void
+movr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  if (r0 == r1)
+    return;
+
+  em_wp(_jit, _FMV_D(r0, r1));
+}
+/* Convert single-precision r1 to a 32-bit integer in GPR r0 with
+   FCVT.W.S, rounding-mode field 1 (round towards zero). */
+static void
+truncr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FCVT_W_S(r0, r1, 1);
+
+  em_wp(_jit, insn);
+}
+/* Convert double-precision r1 to a 32-bit integer in GPR r0 with
+   FCVT.W.D, rounding-mode field 1 (round towards zero). */
+static void
+truncr_d_i(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FCVT_W_D(r0, r1, 1);
+
+  em_wp(_jit, insn);
+}
+/* Convert single-precision r1 to a 64-bit integer in GPR r0 with
+   FCVT.L.S (an RV64F encoding), rounding-mode field 1 (round towards
+   zero). */
+static void
+truncr_f_l(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FCVT_L_S(r0, r1, 1);
+
+  em_wp(_jit, insn);
+}
+/* Convert double-precision r1 to a 64-bit integer in GPR r0 with
+   FCVT.L.D (an RV64D encoding), rounding-mode field 1 (round towards
+   zero). */
+static void
+truncr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FCVT_L_D(r0, r1, 1);
+
+  em_wp(_jit, insn);
+}
+
+/* Convert the word-sized integer in GPR r1 to single precision in FPR r0:
+   FCVT.S.W on 32-bit targets, FCVT.S.L on 64-bit ones, rounding-mode
+   field 0. */
+static void
+extr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+#if __WORDSIZE == 32
+  em_wp(_jit, _FCVT_S_W(r0, r1, 0));
+#elif __WORDSIZE == 64
+  em_wp(_jit, _FCVT_S_L(r0, r1, 0));
+#endif
+}
+/* Convert the word-sized integer in GPR r1 to double precision in FPR r0:
+   FCVT.D.W on 32-bit targets, FCVT.D.L on 64-bit ones, rounding-mode
+   field 0. */
+static void
+extr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+#if __WORDSIZE == 32
+  em_wp(_jit, _FCVT_D_W(r0, r1, 0));
+#elif __WORDSIZE == 64
+  em_wp(_jit, _FCVT_D_L(r0, r1, 0));
+#endif
+}
+
+/* Widen single-precision r1 to double precision in r0 (FCVT.D.S,
+   rounding-mode field 0). */
+static void
+extr_f_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FCVT_D_S(r0, r1, 0);
+
+  em_wp(_jit, insn);
+}
+/* Narrow double-precision r1 to single precision in r0 (FCVT.S.D,
+   rounding-mode field 0). */
+static void
+extr_d_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const uint32_t insn = _FCVT_S_D(r0, r1, 0);
+
+  em_wp(_jit, insn);
+}
+
+/* Load the single-precision constant i0 into FPR r0: its bit pattern is
+   put in a scratch GPR with movi and transferred with FMV.W.X. */
+static void
+movi_f(jit_state_t *_jit, int32_t r0, jit_float32_t i0)
+{
+  union { jit_float32_t f; int32_t i; } bits;
+  bits.f = i0;
+
+  const int32_t gpr = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, gpr, bits.i);
+  em_wp(_jit, _FMV_W_X(r0, gpr));
+  unget_temp_gpr(_jit);
+}
+/* Load the double-precision constant i0 into FPR r0: its bit pattern is
+   put in a scratch GPR with movi and transferred with FMV.D.X. */
+static void
+movi_d(jit_state_t *_jit, int32_t r0, jit_float64_t i0)
+{
+  // TODO: How to move a 64 bit value from a 32 bit X register?
+  // ATM only works on RV64
+  union { jit_float64_t f; int64_t i; } bits;
+  bits.f = i0;
+
+  const int32_t gpr = jit_gpr_regno(get_temp_gpr(_jit));
+  movi(_jit, gpr, bits.i);
+  em_wp(_jit, _FMV_D_X(r0, gpr));
+  unget_temp_gpr(_jit);
+}
+
+
+/*
+ * Argument management
+ */
+/*
+ * Fetch a single-precision call result: copy FA0 into r0.
+ *
+ * The destination comes first in movr_f, so r0 must be the first operand,
+ * as in the integer retval_* helpers.  The opposite order would overwrite
+ * FA0 with r0, which is what retr_f does before returning.
+ */
+static void
+retval_f(jit_state_t *_jit, int32_t r0)
+{
+  movr_f(_jit, r0, jit_fpr_regno(_FA0));
+}
+
+/*
+ * Fetch a double-precision call result: copy FA0 into r0.
+ *
+ * The destination comes first in movr_d, so r0 must be the first operand,
+ * as in the integer retval_* helpers.  The opposite order would overwrite
+ * FA0 with r0, which is what retr_d does before returning.
+ */
+static void
+retval_d(jit_state_t *_jit, int32_t r0)
+{
+  movr_d(_jit, r0, jit_fpr_regno(_FA0));
+}
+
+/* Return the single-precision value in FPR u: copy it into FA0, then emit
+   the return. */
+static void
+retr_f(jit_state_t *_jit, int32_t u)
+{
+  const int32_t fa0 = jit_fpr_regno(_FA0);
+
+  movr_f(_jit, fa0, u);
+  ret(_jit);
+}
+
+/* Return the double-precision value in FPR u: copy it into FA0, then emit
+   the return. */
+static void
+retr_d(jit_state_t *_jit, int32_t u)
+{
+  const int32_t fa0 = jit_fpr_regno(_FA0);
+
+  movr_d(_jit, fa0, u);
+  ret(_jit);
+}
+
+
+/*
+ * Branch instructions
+ */
+
+/* Branch if r0 < r1 (single precision): FLT.S writes its result to a
+   scratch GPR and the branch is taken when that result is non-zero. */
+static jit_reloc_t
+bltr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLT_S(flag, r0, r1));
+  reloc = bner(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if r0 <= r1 (single precision): FLE.S writes its result to a
+   scratch GPR and the branch is taken when that result is non-zero. */
+static jit_reloc_t
+bler_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLE_S(flag, r0, r1));
+  reloc = bner(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if r0 == r1 (single precision): FEQ.S writes its result to a
+   scratch GPR and the branch is taken when that result is non-zero. */
+static jit_reloc_t
+beqr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FEQ_S(flag, r0, r1));
+  reloc = bner(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if r0 >= r1 (single precision), expressed as r1 <= r0. */
+static jit_reloc_t
+bger_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_reloc_t reloc = bler_f(_jit, r1, r0);
+
+  return reloc;
+}
+
+/* Branch if r0 > r1 (single precision), expressed as r1 < r0. */
+static jit_reloc_t
+bgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_reloc_t reloc = bltr_f(_jit, r1, r0);
+
+  return reloc;
+}
+
+/* Branch if r0 != r1 (single precision): taken when FEQ.S r0, r1 writes
+   0 to the scratch GPR. */
+static jit_reloc_t
+bner_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FEQ_S(flag, r0, r1));
+  reloc = beqr(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if "r1 <= r0" does not hold (single precision): taken when
+   FLE.S r1, r0 writes 0 to the scratch GPR. */
+static jit_reloc_t
+bunltr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLE_S(flag, r1, r0));
+  reloc = beqr(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if "r1 < r0" does not hold (single precision): taken when
+   FLT.S r1, r0 writes 0 to the scratch GPR. */
+static jit_reloc_t
+bunler_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLT_S(flag, r1, r0));
+  reloc = beqr(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/*
+ * Branch if neither r0 < r1 nor r1 < r0 holds (single precision): both
+ * FLT.S results are OR-ed together and the branch is taken when the
+ * combined value is 0.
+ *
+ * Two scratch GPRs are acquired, so both are released before returning.
+ */
+static jit_reloc_t
+buneqr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit));
+
+  em_wp(_jit, _FLT_S(t0, r0, r1));
+  em_wp(_jit, _FLT_S(t1, r1, r0));
+  orr(_jit, t0, t0, t1);
+  jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+/* Branch if "r0 < r1" does not hold (single precision): taken when
+   FLT.S r0, r1 writes 0 to the scratch GPR. */
+static jit_reloc_t
+bunger_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLT_S(flag, r0, r1));
+  reloc = beqr(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if "r0 <= r1" does not hold (single precision): taken when
+   FLE.S r0, r1 writes 0 to the scratch GPR. */
+static jit_reloc_t
+bungtr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLE_S(flag, r0, r1));
+  reloc = beqr(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/*
+ * Branch if r1 < r0 or r0 < r1 (single precision): both FLT.S results are
+ * OR-ed together and the branch is taken when the combined value is
+ * non-zero.
+ *
+ * Two scratch GPRs are acquired, so both are released before returning.
+ */
+static jit_reloc_t
+bltgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit));
+
+  em_wp(_jit, _FLT_S(t0, r1, r0));
+  em_wp(_jit, _FLT_S(t1, r0, r1));
+  orr(_jit, t0, t0, t1);
+  jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+/*
+ * Branch if both operands are ordered (single precision): each register
+ * is compared with itself using FEQ.S, the two results are AND-ed and the
+ * branch is taken when the combined value is non-zero.
+ *
+ * Two scratch GPRs are acquired, so both are released before returning.
+ */
+static jit_reloc_t
+bordr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit));
+
+  em_wp(_jit, _FEQ_S(t0, r0, r0));
+  em_wp(_jit, _FEQ_S(t1, r1, r1));
+  andr(_jit, t0, t0, t1);
+  jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+/*
+ * Branch if at least one operand is unordered (single precision): each
+ * register is compared with itself using FEQ.S, the two results are AND-ed
+ * and the branch is taken when the combined value is 0.
+ *
+ * Two scratch GPRs are acquired, so both are released before returning.
+ */
+static jit_reloc_t
+bunordr_f(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit));
+
+  em_wp(_jit, _FEQ_S(t0, r1, r1));
+  em_wp(_jit, _FEQ_S(t1, r0, r0));
+  andr(_jit, t0, t0, t1);
+  jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+/* Branch if r0 < r1 (double precision): FLT.D writes its result to a
+   scratch GPR and the branch is taken when that result is non-zero. */
+static jit_reloc_t
+bltr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLT_D(flag, r0, r1));
+  reloc = bner(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if r0 <= r1 (double precision): FLE.D writes its result to a
+   scratch GPR and the branch is taken when that result is non-zero. */
+static jit_reloc_t
+bler_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLE_D(flag, r0, r1));
+  reloc = bner(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if r0 == r1 (double precision): FEQ.D writes its result to a
+   scratch GPR and the branch is taken when that result is non-zero. */
+static jit_reloc_t
+beqr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FEQ_D(flag, r0, r1));
+  reloc = bner(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if r0 >= r1 (double precision), expressed as r1 <= r0. */
+static jit_reloc_t
+bger_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_reloc_t reloc = bler_d(_jit, r1, r0);
+
+  return reloc;
+}
+
+/* Branch if r0 > r1 (double precision), expressed as r1 < r0. */
+static jit_reloc_t
+bgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_reloc_t reloc = bltr_d(_jit, r1, r0);
+
+  return reloc;
+}
+
+/* Branch if r0 != r1 (double precision): taken when FEQ.D r0, r1 writes
+   0 to the scratch GPR. */
+static jit_reloc_t
+bner_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FEQ_D(flag, r0, r1));
+  reloc = beqr(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if "r1 <= r0" does not hold (double precision): taken when
+   FLE.D r1, r0 writes 0 to the scratch GPR. */
+static jit_reloc_t
+bunltr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLE_D(flag, r1, r0));
+  reloc = beqr(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if "r1 < r0" does not hold (double precision): taken when
+   FLT.D r1, r0 writes 0 to the scratch GPR. */
+static jit_reloc_t
+bunler_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLT_D(flag, r1, r0));
+  reloc = beqr(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/*
+ * Branch if neither r0 < r1 nor r1 < r0 holds (double precision): both
+ * FLT.D results are OR-ed together and the branch is taken when the
+ * combined value is 0.
+ *
+ * Two scratch GPRs are acquired, so both are released before returning.
+ */
+static jit_reloc_t
+buneqr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit));
+
+  em_wp(_jit, _FLT_D(t0, r0, r1));
+  em_wp(_jit, _FLT_D(t1, r1, r0));
+  orr(_jit, t0, t0, t1);
+  jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+/* Branch if "r0 < r1" does not hold (double precision): taken when
+   FLT.D r0, r1 writes 0 to the scratch GPR. */
+static jit_reloc_t
+bunger_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLT_D(flag, r0, r1));
+  reloc = beqr(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/* Branch if "r0 <= r1" does not hold (double precision): taken when
+   FLE.D r0, r1 writes 0 to the scratch GPR. */
+static jit_reloc_t
+bungtr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  const int32_t flag = jit_gpr_regno(get_temp_gpr(_jit));
+  jit_reloc_t reloc;
+
+  em_wp(_jit, _FLE_D(flag, r0, r1));
+  reloc = beqr(_jit, flag, jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+
+  return reloc;
+}
+
+/*
+ * Branch if r1 < r0 or r0 < r1 (double precision): both FLT.D results are
+ * OR-ed together and the branch is taken when the combined value is
+ * non-zero.
+ *
+ * Two scratch GPRs are acquired, so both are released before returning.
+ */
+static jit_reloc_t
+bltgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit));
+
+  em_wp(_jit, _FLT_D(t0, r1, r0));
+  em_wp(_jit, _FLT_D(t1, r0, r1));
+  orr(_jit, t0, t0, t1);
+  jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+/*
+ * Branch if both operands are ordered (double precision): each register
+ * is compared with itself using FEQ.D, the two results are AND-ed and the
+ * branch is taken when the combined value is non-zero.
+ *
+ * Two scratch GPRs are acquired, so both are released before returning.
+ */
+static jit_reloc_t
+bordr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit));
+
+  em_wp(_jit, _FEQ_D(t0, r0, r0));
+  em_wp(_jit, _FEQ_D(t1, r1, r1));
+  andr(_jit, t0, t0, t1);
+  jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+/*
+ * Branch if at least one operand is unordered (double precision): each
+ * register is compared with itself using FEQ.D, the two results are AND-ed
+ * and the branch is taken when the combined value is 0.
+ *
+ * Two scratch GPRs are acquired, so both are released before returning.
+ */
+static jit_reloc_t
+bunordr_d(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit));
+
+  em_wp(_jit, _FEQ_D(t0, r1, r1));
+  em_wp(_jit, _FEQ_D(t1, r0, r0));
+  andr(_jit, t0, t0, t1);
+  jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
diff --git a/lightening/riscv.c b/lightening/riscv.c
new file mode 100644
index 000000000..808192fae
--- /dev/null
+++ b/lightening/riscv.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (C) 2012-2021  Free Software Foundation, Inc.
+ *
+ * This file is part of GNU lightning.
+ *
+ * GNU lightning is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU lightning is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ * License for more details.
+ *
+ * Authors:
+ *      Ekaitz Zarraga <ekaitz@elenq.tech>
+ */
+
+#include "riscv-cpu.c"
+#include "riscv-fpu.c"
+
+// Registers used for argument passing, in allocation order.
+static const jit_gpr_t abi_gpr_args[] = {
+  _A0, _A1, _A2, _A3, _A4, _A5, _A6, _A7
+};
+static const jit_fpr_t abi_fpr_args[] = {
+  _FA0, _FA1, _FA2, _FA3, _FA4, _FA5, _FA6, _FA7
+};
+// Element counts of the two tables above.
+static const int abi_gpr_arg_count = sizeof(abi_gpr_args) / sizeof(abi_gpr_args[0]);
+static const int abi_fpr_arg_count = sizeof(abi_fpr_args) / sizeof(abi_fpr_args[0]);
+
+struct abi_arg_iterator  // cursor used by next_abi_arg to place operands one by one
+{
+  const jit_operand_t *args;  // operand list being walked (set by reset_abi_arg_iterator)
+  size_t argc;  // number of entries in args
+
+  size_t arg_idx;  // index of the next operand next_abi_arg will consume
+  size_t gpr_idx;  // next unused entry of abi_gpr_args
+  size_t fpr_idx;  // next unused entry of abi_fpr_args
+  uint32_t vfp_used_registers;  // NOTE(review): only zeroed, never used by the code in riscv.c; confirm it is needed
+  size_t stack_size;  // bytes of stack already assigned to operands that did not get a register
+  size_t stack_padding;  // NOTE(review): only zeroed, never used by the code in riscv.c; confirm it is needed
+};
+
+// Page size in bytes, cached by jit_get_cpu and used by jit_flush to round
+// the flushed range out to page boundaries.
+static size_t page_size;
+
+// Query the platform once at start-up.  Returns 1 on success and 0 when
+// the page size cannot be determined.
+jit_bool_t
+jit_get_cpu(void)
+{
+  // sysconf reports failure with -1; storing that into a size_t would
+  // yield an enormous "page size" that jit_flush would then use as a mask.
+  long queried = sysconf(_SC_PAGE_SIZE);
+  if (queried <= 0)
+    return 0;
+  page_size = (size_t) queried;
+  // FIXME check version, extensions, hardware fp support
+  //
+  // List of macro definitions for riscv support:
+  // -------------------------------------------
+  // __riscv: defined for any RISC-V target. Older versions of the GCC
+  // toolchain defined __riscv__.
+  //
+  // __riscv_xlen: 32 for RV32 and 64 for RV64.
+  //
+  // __riscv_float_abi_soft, __riscv_float_abi_single,
+  // __riscv_float_abi_double: one of these three will be defined, depending on
+  // target ABI.
+  //
+  // __riscv_cmodel_medlow, __riscv_cmodel_medany: one of these two will be
+  // defined, depending on the target code model.
+  //
+  // __riscv_mul: defined when targeting the 'M' ISA extension.
+  //
+  // __riscv_muldiv: defined when targeting the 'M' ISA extension and -mno-div
+  // has not been used.
+  //
+  // __riscv_div: defined when targeting the 'M' ISA extension and -mno-div has
+  // not been used.
+  //
+  // __riscv_atomic: defined when targeting the 'A' ISA extension.
+  //
+  // __riscv_flen: 32 when targeting the 'F' ISA extension (but not 'D') and 64
+  // when targeting 'FD'.
+  //
+  // __riscv_fdiv: defined when targeting the 'F' or 'D' ISA extensions and
+  // -mno-fdiv has not been used.
+  //
+  // __riscv_fsqrt: defined when targeting the 'F' or 'D' ISA extensions and
+  // -mno-fdiv has not been used.
+  //
+  // __riscv_compressed: defined when targeting the 'C' ISA extension.
+  return 1;
+}
+
+// Per-state initialisation hook.  This port has nothing to set up, so it
+// always reports success.
+jit_bool_t
+jit_init(jit_state_t *_jit)
+{
+  (void) _jit;
+  return 1;
+}
+
+// Size in bytes reported for the initial frame; this port reports none.
+static size_t
+jit_initial_frame_size (void)
+{
+  enum { initial_frame_bytes = 0 };
+  return initial_frame_bytes;
+}
+
+// Point ITER at the operand list ARGS of length ARGC and clear every
+// cursor and counter so that iteration starts from the first operand.
+static void
+reset_abi_arg_iterator(struct abi_arg_iterator *iter, size_t argc,
+                       const jit_operand_t *args)
+{
+  // Members not named in the initializer are zero-initialised.
+  *iter = (struct abi_arg_iterator) { .args = args, .argc = argc };
+}
+
+// Consume the next operand of ITER and store in *ARG where it lives: the
+// next free argument register of the matching class if one is left,
+// otherwise a stack slot at the current offset from JIT_SP.
+static void
+next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg)
+{
+  ASSERT(iter->arg_idx < iter->argc);
+  const enum jit_operand_abi kind = iter->args[iter->arg_idx].abi;
+  iter->arg_idx += 1;
+
+  if (is_gpr_arg(kind) && iter->gpr_idx < abi_gpr_arg_count) {
+    const jit_gpr_t reg = abi_gpr_args[iter->gpr_idx];
+    iter->gpr_idx += 1;
+    *arg = jit_operand_gpr (kind, reg);
+  } else if (is_fpr_arg(kind) && iter->fpr_idx < abi_fpr_arg_count) {
+    const jit_fpr_t reg = abi_fpr_args[iter->fpr_idx];
+    iter->fpr_idx += 1;
+    *arg = jit_operand_fpr (kind, reg);
+  } else {
+    // No register left for this class: the operand goes to the stack and
+    // takes one machine word.
+    *arg = jit_operand_mem (kind, JIT_SP, iter->stack_size);
+#if __WORDSIZE == 32
+    iter->stack_size += 4;
+#elif __WORDSIZE == 64
+    iter->stack_size += 8;
+#endif
+  }
+}
+
+// Flush the instruction cache for the code between FPTR and TPTR.  The
+// start is rounded down and the end rounded up to a multiple of the page
+// size cached by jit_get_cpu.
+static void
+jit_flush(void *fptr, void *tptr)
+{
+  // For an unsigned value, ~(x - 1) is the same bit pattern as -x.
+  const size_t page_mask = ~(page_size - 1);
+  jit_word_t first = (jit_word_t) fptr & page_mask;
+  jit_word_t last = ((jit_word_t) tptr + page_size - 1) & page_mask;
+
+  __clear_cache((void *) first, (void *) last);
+}
+
+// Stack alignment, in bytes, that this port reports.
+// NOTE: See: https://github.com/riscv/riscv-gcc/issues/61
+// NOTE(review): the RISC-V psABI asks for a 16-byte aligned stack
+// pointer; confirm that 8 is intended here.
+static inline size_t
+jit_stack_alignment(void)
+{
+  enum { stack_alignment_bytes = 8 };
+  return stack_alignment_bytes;
+}
+
+// Hook for replacing a relocation with a shorter encoding.  This port
+// performs no shortening, so the function does nothing.
+static void
+jit_try_shorten(jit_state_t *_jit, jit_reloc_t reloc, jit_pointer_t addr)
+{
+  (void) _jit;
+  (void) reloc;
+  (void) addr;
+}
+
+// Turn a code address into a callable function pointer.  No adjustment
+// is made in this port: the address is returned unchanged.
+static void*
+bless_function_pointer(void *ptr)
+{
+  void *callable = ptr;
+  return callable;
+}
+
+
+/*
+ * Veneers
+ */
+struct veneer{  // C view of the sequence written by emit_veneer
+  instr_t auipc;
+  instr_t load;        // `ld` in RV64 and `lw` in RV32
+  instr_t jalr;
+#if __WORDSIZE == 64
+  uint64_t address;  /* NOTE(review): if instr_t is 4 bytes wide, natural
+                      * alignment places this member at offset 16, while
+                      * emit_veneer emits the literal at offset 12 (the
+                      * displacement of its load) -- confirm the layout. */
+#elif __WORDSIZE == 32
+  uint32_t address;
+#endif
+};
+
+// Emit a veneer that jumps to TARGET: three instructions that load an
+// address literal placed right behind them and jump to it.  The literal
+// is one machine word wide.
+static void
+emit_veneer(jit_state_t *_jit, jit_pointer_t target)
+{
+  // Generated sequence (RV64 flavour):
+  //   auipc t0, 0          ; t0 = address of this instruction
+  //   ld    t0, 12(t0)     ; fetch the literal stored after the jalr
+  //   jalr  zero, 0(t0)    ; jump without linking
+  //   ADDRESS_LITERAL
+  // NOTE(review): the literal sits 12 bytes after the start, so on RV64
+  // it is 8-byte aligned only when the veneer starts at an address that
+  // is 4 modulo 8; confirm that a misaligned `ld` is acceptable.
+  enum { literal_disp = 12 };  // three 4-byte instructions precede it
+  const int32_t scratch = jit_gpr_regno(get_temp_gpr(_jit));
+
+  emit_u32(_jit, _AUIPC(scratch, 0));
+#if __WORDSIZE == 64
+  emit_u32(_jit, _LD(scratch, scratch, literal_disp));
+#elif __WORDSIZE == 32
+  emit_u32(_jit, _LW(scratch, scratch, literal_disp));
+#endif
+  emit_u32(_jit, _JALR(jit_gpr_regno(_ZERO), scratch, 0));
+#if __WORDSIZE == 64
+  emit_u64(_jit, (uint64_t) target);
+#elif __WORDSIZE == 32
+  emit_u32(_jit, (uint32_t) target);
+#endif
+
+  unget_temp_gpr(_jit);
+}
+
+// Retarget an already emitted veneer: overwrite its address literal with
+// ADDR.  LOC points at the first instruction of the veneer.
+static void
+patch_veneer(uint32_t *loc, jit_pointer_t addr)
+{
+  // emit_veneer writes three 32-bit instructions and then the literal,
+  // which its load fetches with a displacement of 12 bytes.  Going
+  // through `struct veneer' would miss that spot on RV64, where the
+  // 64-bit member is padded out to offset 16, so address the literal by
+  // its position instead.  memcpy (string.h is already needed by this
+  // file for memset) also keeps the store valid when the literal is not
+  // naturally aligned.
+  enum { veneer_insn_count = 3 };
+  uintptr_t literal = (uintptr_t) addr;
+
+  memcpy(loc + veneer_insn_count, &literal, sizeof literal);
+}
+
+
+/*
+ * Conditional jumps
+ */
+// Store the branch displacement V into the B-type instruction at LOC by
+// scattering bits 12, 11, 10..5 and 4..1 of V into their immediate
+// fields.  Bit 0 of V is not encoded.
+static void
+patch_jcc_offset(uint32_t *loc, ptrdiff_t v)
+{
+  instr_t *insn = (instr_t *) loc;
+
+  insn->B.imm12   = (v >> 12) & 0x1;
+  insn->B.imm11   = (v >> 11) & 0x1;
+  insn->B.imm10_5 = (v >>  5) & 0x3f;
+  insn->B.imm4_1  = (v >>  1) & 0xf;
+}
+// Point the conditional branch at LOC to its veneer.  The encoding is the
+// same as for a direct branch, so this forwards to patch_jcc_offset.
+static void
+patch_veneer_jcc_offset(uint32_t *loc, ptrdiff_t offset)
+{
+  patch_jcc_offset(loc, offset);
+}
+
+// Decode the signed displacement stored in the B-type instruction at LOC.
+// This is the inverse of patch_jcc_offset: bit 12 of the displacement is
+// the sign bit and bit 0 is always zero.
+static int32_t
+read_jcc_offset(uint32_t *loc)
+{
+  instr_t i;
+  i.w = *loc;
+
+  // Gather the 13-bit displacement in an unsigned value first.  Each field
+  // is masked to its width so the result does not depend on how the
+  // bit-fields are declared.
+  uint32_t raw = ((uint32_t) (i.B.imm12   &  0x1) << 12)
+               | ((uint32_t) (i.B.imm11   &  0x1) << 11)
+               | ((uint32_t) (i.B.imm10_5 & 0x3f) <<  5)
+               | ((uint32_t) (i.B.imm4_1  &  0xf) <<  1);
+
+  // Sign-extend from bit 12.  Shifting the sign up to bit 31 and back down
+  // by 20 would fill bits 31..11 and force bit 11 on for every negative
+  // displacement (-4096 would read back as -2048).
+  int32_t offset = (int32_t) raw;
+  if (raw & (UINT32_C(1) << 12))
+    offset -= INT32_C(1) << 13;
+
+  return offset;
+}
+// Return 1 when OFFSET can be encoded in a conditional branch: it must be
+// even and lie within [-0x1000, 0xFFF].  FLAGS is not consulted.
+static int
+offset_in_jcc_range(ptrdiff_t offset, int flags)
+{
+  const int is_even = (offset & 1) == 0;
+
+  return is_even && offset >= -0x1000 && offset <= 0xFFF;
+}
+
+/*
+ * Unconditional jumps
+ */
+// Decode the signed displacement stored in the J-type instruction at LOC.
+// This is the inverse of patch_jmp_offset: bit 20 of the displacement is
+// the sign bit and bit 0 is always zero.
+static int32_t read_jmp_offset(uint32_t *loc)
+{
+  instr_t i;
+  i.w = *loc;
+
+  // Gather the 21-bit displacement in an unsigned value first.  Each field
+  // is masked to its width so the result does not depend on how the
+  // bit-fields are declared.
+  uint32_t raw = ((uint32_t) (i.J.imm20    &   0x1) << 20)
+               | ((uint32_t) (i.J.imm19_12 &  0xff) << 12)
+               | ((uint32_t) (i.J.imm11    &   0x1) << 11)
+               | ((uint32_t) (i.J.imm10_1  & 0x3ff) <<  1);
+
+  // Sign-extend from bit 20.  Shifting the sign up to bit 31 and back down
+  // by 12 would fill bits 31..19 and force bit 19 on for every negative
+  // displacement.
+  int32_t offset = (int32_t) raw;
+  if (raw & (UINT32_C(1) << 20))
+    offset -= INT32_C(1) << 21;
+
+  return offset;
+}
+// Return 1 when OFFSET can be encoded in an unconditional jump: it must be
+// even and lie within [-0x100000, 0xFFFFF].  FLAGS is not consulted.
+static int
+offset_in_jmp_range(ptrdiff_t offset, int flags)
+{
+  const int is_even = (offset & 1) == 0;
+
+  return is_even && offset >= -0x100000 && offset <= 0xFFFFF;
+}
+
+// Store the jump displacement V into the J-type instruction at LOC by
+// scattering bits 20, 19..12, 11 and 10..1 of V into their immediate
+// fields.  Bit 0 of V is not encoded.
+static void
+patch_jmp_offset(uint32_t *loc, ptrdiff_t v)
+{
+  instr_t *insn = (instr_t *) loc;
+
+  insn->J.imm10_1  = (v >>  1) & 0x3ff;
+  insn->J.imm11    = (v >> 11) & 0x1;
+  insn->J.imm19_12 = (v >> 12) & 0xff;
+  insn->J.imm20    = (v >> 20) & 0x1;
+}
+
+// Point the jump at LOC to its veneer.  The encoding is the same as for a
+// direct jump, so this forwards to patch_jmp_offset.
+static void
+patch_veneer_jmp_offset(uint32_t *loc, ptrdiff_t offset)
+{
+  ptrdiff_t displacement = offset;
+  patch_jmp_offset(loc, displacement);
+}
+
+
+/*
+ * Jumps around the veneer
+ */
+// Make the jump previously emitted at LOC (see jmp_without_veneer) land
+// on the current emission point.
+static void
+patch_jmp_without_veneer(jit_state_t *_jit, uint32_t *loc)
+{
+  // Subtracting two uint32_t pointers counts 32-bit words, whereas
+  // patch_jmp_offset encodes a displacement in bytes.  Scale the
+  // distance; otherwise the jump covers only a quarter of it.
+  ptrdiff_t words = _jit->pc.ui - loc;
+  patch_jmp_offset(loc, words * (ptrdiff_t) sizeof *loc);
+}
+// Emit a non-linking jump with a zero displacement and return where it
+// was written, so that patch_jmp_without_veneer can fill in the target.
+static uint32_t*
+jmp_without_veneer(jit_state_t *_jit)
+{
+  uint32_t *const jump_at = _jit->pc.ui;
+
+  emit_u32(_jit, _JAL(jit_gpr_regno(_ZERO), 0));
+
+  return jump_at;
+}
+
+
+/*
+ * Load from pool offset
+ */
+// Encode the pc-relative displacement V into the AUIPC + load pair at
+// LOC.  The pair computes pc + (hi20 << 12) + sign_extend(lo12).
+static void
+patch_load_from_pool_offset(uint32_t *loc, int32_t v)
+{
+  load_from_pool_t *i = (load_from_pool_t *) loc;
+  // The load sign-extends its 12-bit immediate, so the upper part has to
+  // be rounded to the nearest multiple of 4096 (hence the 0x800); the
+  // remainder then fits in [-2048, 2047].  Plain truncation would be off
+  // by 4096 whenever bit 11 of V is set.
+  int32_t hi20 = (v + 0x800) >> 12;
+  int32_t lo12 = v - hi20 * 4096;
+  i->inst.auipc.U.imm31_12 = (uint32_t) hi20 & 0xfffff;
+  i->inst.load.I.imm11_0   = (uint32_t) lo12 & 0xfff;
+}
+// Decode the displacement stored in the AUIPC + load pair at LOC; this is
+// the inverse of patch_load_from_pool_offset.
+static int32_t
+read_load_from_pool_offset(uint32_t *loc)
+{
+  load_from_pool_t *i =  (load_from_pool_t*) loc;
+  // The AUIPC immediate holds bits 31..12 of its contribution, so it must
+  // be shifted back up before the low part is added.
+  uint32_t hi = ((uint32_t) i->inst.auipc.U.imm31_12 & 0xfffff) << 12;
+  uint32_t lo = (uint32_t) i->inst.load.I.imm11_0 & 0xfff;
+  // Sign-extend the low part from bit 11, as the load instruction does.
+  if (lo & 0x800)
+    lo |= 0xfffff000;
+  return (int32_t) (hi + lo);
+}
+
diff --git a/lightening/riscv.h b/lightening/riscv.h
new file mode 100644
index 000000000..653d74bf9
--- /dev/null
+++ b/lightening/riscv.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (C) 2012-2021  Free Software Foundation, Inc.
+ *
+ * This file is part of GNU lightning.
+ *
+ * GNU lightning is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU lightning is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ * License for more details.
+ *
+ * Authors:
+ *     Ekaitz Zarraga <ekaitz@elenq.tech>
+ */
+
+#ifndef _jit_riscv_h
+#define _jit_riscv_h
+
+#define JIT_NEEDS_LITERAL_POOL 1
+
+// x registers
+// Special registers
+#define _RA     JIT_GPR(1)      // Return address
+#define _SP     JIT_GPR(2)      // Stack pointer
+#define _GP     JIT_GPR(3)      // Global pointer
+#define _TP     JIT_GPR(4)      // Thread pointer
+#define _FP     JIT_GPR(8)      // Frame pointer
+#define _ZERO   JIT_GPR(0)      // Always zero
+// Argument passing
+#define _A0     JIT_GPR(10)
+#define _A1     JIT_GPR(11)
+#define _A2     JIT_GPR(12)
+#define _A3     JIT_GPR(13)
+#define _A4     JIT_GPR(14)
+#define _A5     JIT_GPR(15)
+#define _A6     JIT_GPR(16)
+#define _A7     JIT_GPR(17)
+// Saved registers
+#define _S0     _FP             // S0 is the frame pointer normally
+#define _S1     JIT_GPR(9)
+#define _S2     JIT_GPR(18)
+#define _S3     JIT_GPR(19)
+#define _S4     JIT_GPR(20)
+#define _S5     JIT_GPR(21)
+#define _S6     JIT_GPR(22)
+#define _S7     JIT_GPR(23)
+#define _S8     JIT_GPR(24)
+#define _S9     JIT_GPR(25)
+#define _S10    JIT_GPR(26)
+#define _S11    JIT_GPR(27)
+// Temporaries
+#define _T0     JIT_GPR(5)
+#define _T1     JIT_GPR(6)
+#define _T2     JIT_GPR(7)
+#define _T3     JIT_GPR(28)
+#define _T4     JIT_GPR(29)
+#define _T5     JIT_GPR(30)
+#define _T6     JIT_GPR(31)
+
+// f registers
+// Temporaries
+#define _FT0    JIT_FPR(0)
+#define _FT1    JIT_FPR(1)
+#define _FT2    JIT_FPR(2)
+#define _FT3    JIT_FPR(3)
+#define _FT4    JIT_FPR(4)
+#define _FT5    JIT_FPR(5)
+#define _FT6    JIT_FPR(6)
+#define _FT7    JIT_FPR(7)
+#define _FT8    JIT_FPR(28)
+#define _FT9    JIT_FPR(29)
+#define _FT10   JIT_FPR(30)
+#define _FT11   JIT_FPR(31)
+// Saved registers
+#define _FS0    JIT_FPR(8)
+#define _FS1    JIT_FPR(9)
+#define _FS2    JIT_FPR(18)
+#define _FS3    JIT_FPR(19)
+#define _FS4    JIT_FPR(20)
+#define _FS5    JIT_FPR(21)
+#define _FS6    JIT_FPR(22)
+#define _FS7    JIT_FPR(23)
+#define _FS8    JIT_FPR(24)
+#define _FS9    JIT_FPR(25)
+#define _FS10   JIT_FPR(26)
+#define _FS11   JIT_FPR(27)
+// Argument passing
+#define _FA0    JIT_FPR(10)
+#define _FA1    JIT_FPR(11)
+#define _FA2    JIT_FPR(12)
+#define _FA3    JIT_FPR(13)
+#define _FA4    JIT_FPR(14)
+#define _FA5    JIT_FPR(15)
+#define _FA6    JIT_FPR(16)
+#define _FA7    JIT_FPR(17)
+
+
+// JIT Registers
+// ----------------------------------------------------------------------
+// Caller-save registers                            JIT_R${NUM}
+// Callee-save registers                            JIT_V${NUM}
+// Caller-save temporary registers                  JIT_TMP${NUM}
+// Caller-save floating point registers             JIT_F${NUM}
+// Callee-save floating point registers             JIT_VF${NUM}
+// Caller-save floating point temporary registers   JIT_FTMP${NUM}
+
+// Caller-save registers
+#define JIT_R0  _A0
+#define JIT_R1  _A1
+#define JIT_R2  _A2
+#define JIT_R3  _A3
+#define JIT_R4  _A4
+#define JIT_R5  _A5
+#define JIT_R6  _A6
+#define JIT_R7  _A7
+
+// Register set aside to hold the carry; the temporaries follow
+#define JIT_CARRY  _T0
+#define JIT_TMP0 _T1
+#define JIT_TMP1 _T2
+#define JIT_TMP2 _T3
+
+#define JIT_TMP3 _T4
+// Temporaries (continued; JIT_TMP0 to JIT_TMP3 are defined above)
+#define JIT_TMP4 _T5
+#define JIT_TMP5 _T6
+
+// Callee-save registers
+#define JIT_V0  _S1
+#define JIT_V1  _S2
+#define JIT_V2  _S3
+#define JIT_V3  _S4
+#define JIT_V4  _S5
+#define JIT_V5  _S6
+#define JIT_V6  _S7
+#define JIT_V7  _S8
+#define JIT_V8  _S9
+#define JIT_V9  _S10
+#define JIT_V10 _S11
+
+
+// Callee-save floating point registers
+#define JIT_VF0  _FS0
+#define JIT_VF1  _FS1
+#define JIT_VF2  _FS2
+#define JIT_VF3  _FS3
+#define JIT_VF4  _FS4
+#define JIT_VF5  _FS5
+#define JIT_VF6  _FS6
+#define JIT_VF7  _FS7
+#define JIT_VF8  _FS8
+#define JIT_VF9  _FS9
+#define JIT_VF10 _FS10
+#define JIT_VF11 _FS11
+
+// Caller save floating point registers
+#define JIT_F0   _FA0
+#define JIT_F1   _FA1
+#define JIT_F2   _FA2
+#define JIT_F3   _FA3
+#define JIT_F4   _FA4
+#define JIT_F5   _FA5
+#define JIT_F6   _FA6
+#define JIT_F7   _FA7
+// NOTE: These are temporaries, but we can use them as general purpose
+// registers as there's only one temporary JIT_FTMP supported by lightening.c
+#define JIT_F8   _FT0
+#define JIT_F9   _FT1
+#define JIT_F10  _FT2
+#define JIT_F11  _FT3
+#define JIT_F12  _FT4
+#define JIT_F13  _FT5
+#define JIT_F14  _FT6
+#define JIT_F15  _FT7
+#define JIT_F16  _FT8
+#define JIT_F17  _FT9
+#define JIT_F18  _FT10
+
+// Floating point temporary register
+#define JIT_FTMP _FT11
+
+// Special purpose registers
+#define JIT_FP   _FP
+#define JIT_LR   _RA
+#define JIT_SP   _SP
+
+// TODO: Make sure this is correct
+#define JIT_PLATFORM_CALLEE_SAVE_GPRS JIT_LR
+
+#endif



reply via email to

[Prev in Thread] Current Thread [Next in Thread]