author    Chih-Min Chao <chihmin.chao@sifive.com>  2019-06-06 03:24:27 -0700
committer Chih-Min Chao <chihmin.chao@sifive.com>  2019-06-18 08:56:11 -0700
commit    655aedc0ebd2326d69d389bc714c2d622bf2cb08 (patch)
tree      aa2cf79905906cde9ff6d10c63d1499fb4a484a1
parent    235aa58bfb439c9782defe8bdd21f792e40aac31 (diff)
rvv: add integer/fixed-point/mask/reduction/permutation instructions
based on v-spec 0.7.1
support sections: 12/13/15.1 ~ 15.2/16/17
element size: 8/16/32/64
support ediv: 1

Signed-off-by: Bruce Hoult <bruce@hoult.org>
Signed-off-by: Chih-Min Chao <chihmin.chao@sifive.com>
Signed-off-by: Dave Wen <dave.wen@sifive.com>
-rw-r--r--riscv/decode.h1041
-rw-r--r--riscv/insns/vaadd_vi.h9
-rw-r--r--riscv/insns/vaadd_vv.h2
-rw-r--r--riscv/insns/vaadd_vx.h2
-rw-r--r--riscv/insns/vadc_vim.h11
-rw-r--r--riscv/insns/vadc_vvm.h11
-rw-r--r--riscv/insns/vadc_vxm.h11
-rw-r--r--riscv/insns/vadd_vi.h5
-rw-r--r--riscv/insns/vadd_vv.h5
-rw-r--r--riscv/insns/vadd_vx.h5
-rw-r--r--riscv/insns/vand_vi.h5
-rw-r--r--riscv/insns/vand_vv.h5
-rw-r--r--riscv/insns/vand_vx.h5
-rw-r--r--riscv/insns/vasub_vv.h2
-rw-r--r--riscv/insns/vasub_vx.h2
-rw-r--r--riscv/insns/vcompress_vm.h41
-rw-r--r--riscv/insns/vdiv_vv.h10
-rw-r--r--riscv/insns/vdiv_vx.h10
-rw-r--r--riscv/insns/vdivu_vv.h8
-rw-r--r--riscv/insns/vdivu_vx.h8
-rw-r--r--riscv/insns/vdot_vv.h5
-rw-r--r--riscv/insns/vdotu_vv.h5
-rw-r--r--riscv/insns/vext_x_v.h30
-rw-r--r--riscv/insns/vid_v.h30
-rw-r--r--riscv/insns/viota_m.h52
-rw-r--r--riscv/insns/vmacc_vv.h5
-rw-r--r--riscv/insns/vmacc_vx.h5
-rw-r--r--riscv/insns/vmadc_vim.h14
-rw-r--r--riscv/insns/vmadc_vvm.h14
-rw-r--r--riscv/insns/vmadc_vxm.h14
-rw-r--r--riscv/insns/vmadd_vv.h5
-rw-r--r--riscv/insns/vmadd_vx.h5
-rw-r--r--riscv/insns/vmand_mm.h2
-rw-r--r--riscv/insns/vmandnot_mm.h2
-rw-r--r--riscv/insns/vmax_vv.h10
-rw-r--r--riscv/insns/vmax_vx.h10
-rw-r--r--riscv/insns/vmaxu_vv.h9
-rw-r--r--riscv/insns/vmaxu_vx.h9
-rw-r--r--riscv/insns/vmerge_vim.h9
-rw-r--r--riscv/insns/vmerge_vvm.h9
-rw-r--r--riscv/insns/vmerge_vxm.h9
-rw-r--r--riscv/insns/vmfeq_vf.h5
-rw-r--r--riscv/insns/vmfeq_vv.h5
-rw-r--r--riscv/insns/vmfge_vf.h5
-rw-r--r--riscv/insns/vmfgt_vf.h5
-rw-r--r--riscv/insns/vmfirst_m.h20
-rw-r--r--riscv/insns/vmfle_vf.h5
-rw-r--r--riscv/insns/vmfle_vv.h5
-rw-r--r--riscv/insns/vmflt_vf.h5
-rw-r--r--riscv/insns/vmflt_vv.h5
-rw-r--r--riscv/insns/vmfne_vf.h5
-rw-r--r--riscv/insns/vmfne_vv.h5
-rw-r--r--riscv/insns/vmford_vf.h5
-rw-r--r--riscv/insns/vmford_vv.h5
-rw-r--r--riscv/insns/vmin_vv.h11
-rw-r--r--riscv/insns/vmin_vx.h11
-rw-r--r--riscv/insns/vminu_vv.h9
-rw-r--r--riscv/insns/vminu_vx.h10
-rw-r--r--riscv/insns/vmnand_mm.h2
-rw-r--r--riscv/insns/vmnor_mm.h2
-rw-r--r--riscv/insns/vmor_mm.h2
-rw-r--r--riscv/insns/vmornot_mm.h2
-rw-r--r--riscv/insns/vmpopc_m.h24
-rw-r--r--riscv/insns/vmsbc_vvm.h14
-rw-r--r--riscv/insns/vmsbc_vxm.h14
-rw-r--r--riscv/insns/vmsbf_m.h34
-rw-r--r--riscv/insns/vmseq_vi.h5
-rw-r--r--riscv/insns/vmseq_vv.h6
-rw-r--r--riscv/insns/vmseq_vx.h5
-rw-r--r--riscv/insns/vmsgt_vi.h5
-rw-r--r--riscv/insns/vmsgt_vx.h5
-rw-r--r--riscv/insns/vmsgtu_vi.h5
-rw-r--r--riscv/insns/vmsgtu_vx.h5
-rw-r--r--riscv/insns/vmsif_m.h34
-rw-r--r--riscv/insns/vmsle_vi.h5
-rw-r--r--riscv/insns/vmsle_vv.h5
-rw-r--r--riscv/insns/vmsle_vx.h5
-rw-r--r--riscv/insns/vmsleu_vi.h5
-rw-r--r--riscv/insns/vmsleu_vv.h5
-rw-r--r--riscv/insns/vmsleu_vx.h5
-rw-r--r--riscv/insns/vmslt_vv.h5
-rw-r--r--riscv/insns/vmslt_vx.h5
-rw-r--r--riscv/insns/vmsltu_vv.h5
-rw-r--r--riscv/insns/vmsltu_vx.h5
-rw-r--r--riscv/insns/vmsne_vi.h5
-rw-r--r--riscv/insns/vmsne_vv.h5
-rw-r--r--riscv/insns/vmsne_vx.h5
-rw-r--r--riscv/insns/vmsof_m.h32
-rw-r--r--riscv/insns/vmul_vv.h5
-rw-r--r--riscv/insns/vmul_vx.h5
-rw-r--r--riscv/insns/vmulh_vv.h5
-rw-r--r--riscv/insns/vmulh_vx.h5
-rw-r--r--riscv/insns/vmulhsu_vv.h37
-rw-r--r--riscv/insns/vmulhsu_vx.h37
-rw-r--r--riscv/insns/vmulhu_vv.h5
-rw-r--r--riscv/insns/vmulhu_vx.h5
-rw-r--r--riscv/insns/vmv_s_x.h45
-rw-r--r--riscv/insns/vmv_v_i.h5
-rw-r--r--riscv/insns/vmv_v_v.h5
-rw-r--r--riscv/insns/vmv_v_x.h5
-rw-r--r--riscv/insns/vmxnor_mm.h2
-rw-r--r--riscv/insns/vmxor_mm.h2
-rw-r--r--riscv/insns/vnclip_vi.h24
-rw-r--r--riscv/insns/vnclip_vv.h30
-rw-r--r--riscv/insns/vnclip_vx.h29
-rw-r--r--riscv/insns/vnclipu_vi.h20
-rw-r--r--riscv/insns/vnclipu_vv.h26
-rw-r--r--riscv/insns/vnclipu_vx.h26
-rw-r--r--riscv/insns/vnmsac_vv.h5
-rw-r--r--riscv/insns/vnmsac_vx.h5
-rw-r--r--riscv/insns/vnmsub_vv.h5
-rw-r--r--riscv/insns/vnmsub_vx.h5
-rw-r--r--riscv/insns/vnsra_vi.h5
-rw-r--r--riscv/insns/vnsra_vv.h5
-rw-r--r--riscv/insns/vnsra_vx.h5
-rw-r--r--riscv/insns/vnsrl_vi.h5
-rw-r--r--riscv/insns/vnsrl_vv.h5
-rw-r--r--riscv/insns/vnsrl_vx.h5
-rw-r--r--riscv/insns/vor_vi.h5
-rw-r--r--riscv/insns/vor_vv.h5
-rw-r--r--riscv/insns/vor_vx.h5
-rw-r--r--riscv/insns/vredand_vs.h5
-rw-r--r--riscv/insns/vredmax_vs.h5
-rw-r--r--riscv/insns/vredmaxu_vs.h5
-rw-r--r--riscv/insns/vredmin_vs.h5
-rw-r--r--riscv/insns/vredminu_vs.h5
-rw-r--r--riscv/insns/vredor_vs.h5
-rw-r--r--riscv/insns/vredsum_vs.h5
-rw-r--r--riscv/insns/vredxor_vs.h5
-rw-r--r--riscv/insns/vrem_vv.h11
-rw-r--r--riscv/insns/vrem_vx.h10
-rw-r--r--riscv/insns/vremu_vv.h8
-rw-r--r--riscv/insns/vremu_vx.h8
-rw-r--r--riscv/insns/vrgather_vi.h29
-rw-r--r--riscv/insns/vrgather_vv.h39
-rw-r--r--riscv/insns/vrgather_vx.h30
-rw-r--r--riscv/insns/vrsub_vi.h5
-rw-r--r--riscv/insns/vrsub_vx.h5
-rw-r--r--riscv/insns/vsadd_vi.h27
-rw-r--r--riscv/insns/vsadd_vv.h28
-rw-r--r--riscv/insns/vsadd_vx.h27
-rw-r--r--riscv/insns/vsaddu_vi.h11
-rw-r--r--riscv/insns/vsaddu_vv.h11
-rw-r--r--riscv/insns/vsaddu_vx.h12
-rw-r--r--riscv/insns/vsbc_vvm.h11
-rw-r--r--riscv/insns/vsbc_vxm.h11
-rw-r--r--riscv/insns/vslide1down_vx.h42
-rw-r--r--riscv/insns/vslide1up_vx.h32
-rw-r--r--riscv/insns/vslidedown_vi.h33
-rw-r--r--riscv/insns/vslidedown_vx.h33
-rw-r--r--riscv/insns/vslideup_vi.h33
-rw-r--r--riscv/insns/vslideup_vx.h29
-rw-r--r--riscv/insns/vsll_vi.h5
-rw-r--r--riscv/insns/vsll_vv.h5
-rw-r--r--riscv/insns/vsll_vx.h5
-rw-r--r--riscv/insns/vsmul_vv.h33
-rw-r--r--riscv/insns/vsmul_vx.h34
-rw-r--r--riscv/insns/vsra_vi.h5
-rw-r--r--riscv/insns/vsra_vv.h5
-rw-r--r--riscv/insns/vsra_vx.h5
-rw-r--r--riscv/insns/vsrl_vi.h5
-rw-r--r--riscv/insns/vsrl_vv.h5
-rw-r--r--riscv/insns/vsrl_vx.h5
-rw-r--r--riscv/insns/vssra_vi.h8
-rw-r--r--riscv/insns/vssra_vv.h9
-rw-r--r--riscv/insns/vssra_vx.h9
-rw-r--r--riscv/insns/vssrl_vi.h9
-rw-r--r--riscv/insns/vssrl_vv.h9
-rw-r--r--riscv/insns/vssrl_vx.h9
-rw-r--r--riscv/insns/vssub_vv.h28
-rw-r--r--riscv/insns/vssub_vx.h28
-rw-r--r--riscv/insns/vssubu_vv.h29
-rw-r--r--riscv/insns/vssubu_vx.h28
-rw-r--r--riscv/insns/vsub_vv.h5
-rw-r--r--riscv/insns/vsub_vx.h5
-rw-r--r--riscv/insns/vwadd_vv.h6
-rw-r--r--riscv/insns/vwadd_vx.h6
-rw-r--r--riscv/insns/vwadd_wv.h6
-rw-r--r--riscv/insns/vwadd_wx.h6
-rw-r--r--riscv/insns/vwaddu_vv.h6
-rw-r--r--riscv/insns/vwaddu_vx.h6
-rw-r--r--riscv/insns/vwaddu_wv.h6
-rw-r--r--riscv/insns/vwaddu_wx.h6
-rw-r--r--riscv/insns/vwmacc_vv.h6
-rw-r--r--riscv/insns/vwmacc_vx.h6
-rw-r--r--riscv/insns/vwmaccsu_vv.h6
-rw-r--r--riscv/insns/vwmaccsu_vx.h6
-rw-r--r--riscv/insns/vwmaccu_vv.h6
-rw-r--r--riscv/insns/vwmaccu_vx.h6
-rw-r--r--riscv/insns/vwmaccus_vx.h6
-rw-r--r--riscv/insns/vwmul_vv.h6
-rw-r--r--riscv/insns/vwmul_vx.h6
-rw-r--r--riscv/insns/vwmulsu_vv.h16
-rw-r--r--riscv/insns/vwmulsu_vx.h16
-rw-r--r--riscv/insns/vwmulu_vv.h6
-rw-r--r--riscv/insns/vwmulu_vx.h6
-rw-r--r--riscv/insns/vwredsum_vs.h5
-rw-r--r--riscv/insns/vwredsumu_vs.h5
-rw-r--r--riscv/insns/vwsmacc_vv.h2
-rw-r--r--riscv/insns/vwsmacc_vx.h2
-rw-r--r--riscv/insns/vwsmaccsu_vv.h2
-rw-r--r--riscv/insns/vwsmaccsu_vx.h2
-rw-r--r--riscv/insns/vwsmaccu_vv.h2
-rw-r--r--riscv/insns/vwsmaccu_vx.h2
-rw-r--r--riscv/insns/vwsmaccus_vx.h2
-rw-r--r--riscv/insns/vwsub_vv.h6
-rw-r--r--riscv/insns/vwsub_vx.h6
-rw-r--r--riscv/insns/vwsub_wv.h6
-rw-r--r--riscv/insns/vwsub_wx.h6
-rw-r--r--riscv/insns/vwsubu_vv.h6
-rw-r--r--riscv/insns/vwsubu_vx.h6
-rw-r--r--riscv/insns/vwsubu_wv.h6
-rw-r--r--riscv/insns/vwsubu_wx.h6
-rw-r--r--riscv/insns/vxor_vi.h5
-rw-r--r--riscv/insns/vxor_vv.h5
-rw-r--r--riscv/insns/vxor_vx.h5
-rw-r--r--riscv/riscv.mk.in206
217 files changed, 3473 insertions, 0 deletions
diff --git a/riscv/decode.h b/riscv/decode.h
index 6cbf934..ca6a999 100644
--- a/riscv/decode.h
+++ b/riscv/decode.h
@@ -11,6 +11,7 @@
# error spike requires a little-endian host
#endif
+#include <algorithm>
#include <cstdint>
#include <string.h>
#include <strings.h>
@@ -23,6 +24,8 @@
typedef int64_t sreg_t;
typedef uint64_t reg_t;
+typedef __int128 int128_t;
+typedef unsigned __int128 uint128_t;
const int NXPR = 32;
const int NFPR = 32;
@@ -63,6 +66,12 @@ const int NCSR = 4096;
#define MAX_INSN_LENGTH 8
#define PC_ALIGN 2
+#ifndef TAIL_ZEROING
+ #define TAIL_ZEROING true
+#else
+ #define TAIL_ZEROING false
+#endif
+
typedef uint64_t insn_bits_t;
class insn_t
{
@@ -141,8 +150,10 @@ private:
#define P (*p)
#define READ_REG(reg) STATE.XPR[reg]
#define READ_FREG(reg) STATE.FPR[reg]
+#define RD READ_REG(insn.rd())
#define RS1 READ_REG(insn.rs1())
#define RS2 READ_REG(insn.rs2())
+#define RS3 READ_REG(insn.rs3())
#define WRITE_RD(value) WRITE_REG(insn.rd(), value)
#ifndef RISCV_ENABLE_COMMITLOG
@@ -288,6 +299,1036 @@ inline freg_t f128_negate(freg_t a)
throw trap_illegal_instruction(0); \
(which); })
+/* For debug only. This will fail if the native machine's float types are not IEEE */
+inline float to_f(float32_t f){float r; memcpy(&r, &f, sizeof(r)); return r;}
+inline double to_f(float64_t f){double r; memcpy(&r, &f, sizeof(r)); return r;}
+inline long double to_f(float128_t f){long double r; memcpy(&r, &f, sizeof(r)); return r;}
+
+// Vector macros
+#define e8 8 // 8b elements
+#define e16 16 // 16b elements
+#define e32 32 // 32b elements
+#define e64 64 // 64b elements
+#define e128 128 // 128b elements
+
+#define vsext(x, sew) (((sreg_t)(x) << (64-sew)) >> (64-sew))
+#define vzext(x, sew) (((reg_t)(x) << (64-sew)) >> (64-sew))
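// A minimal worked sketch of the two helpers above (editor's example): both keep only
// the low `sew` bits of x; vsext then sign-extends them to 64 bits, vzext zero-extends.
//   vsext(0xFFu, 8) -> (sreg_t)-1   (0xFF read as a signed 8-bit value)
//   vzext(0xFFu, 8) -> 0xFF         (upper bits cleared)
//   vsext(0x7Fu, 8) -> 0x7F         (positive values pass through unchanged)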
+
+//
+// vector: masking skip helper
+//
+#define VI_LOOP_ELEMENT_SKIP(BODY) \
+ const int mlen = P.VU.vmlen; \
+ const int midx = (mlen * i) / 64; \
+ const int mpos = (mlen * i) % 64; \
+ if (insn.v_vm() == 0) { \
+ BODY; \
+ bool skip = ((P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1) == 0; \
+ if (skip) \
+ continue; \
+ }
+
+#define VI_ELEMENT_SKIP(inx) \
+ if (inx >= vl && TAIL_ZEROING) { \
+ is_valid = false; \
+ } else if (inx >= vl && !TAIL_ZEROING) { \
+ continue; \
+ } else if (inx < P.VU.vstart) { \
+ continue; \
+ } else { \
+ VI_LOOP_ELEMENT_SKIP(); \
+ }
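// Editor's sketch of the mask addressing used by the helpers above: with vmlen mask
// bits per element, element i's mask bit sits in 64-bit word (vmlen*i)/64 of v0 at
// bit offset (vmlen*i)%64. The hypothetical helper below only restates that arithmetic.
static inline bool sketch_mask_bit(const uint64_t *v0_words, int i, int vmlen)
{
  const int midx = (vmlen * i) / 64;      // which 64-bit chunk of the mask register
  const int mpos = (vmlen * i) % 64;      // which bit inside that chunk
  return (v0_words[midx] >> mpos) & 0x1;  // 1 means the element is active
}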
+
+//
+// vector: operation and register access check helper
+//
+static inline bool is_overlaped(const int astart, const int asize,
+ const int bstart, const int bsize)
+{
+ const int aend = astart + asize;
+ const int bend = bstart + bsize;
+ return std::max(aend, bend) - std::min(astart, bstart) < asize + bsize;
+}
+
+#define VI_NARROW_CHECK_COMMON \
+ require(P.VU.vlmul <= 4); \
+ require(P.VU.vsew * 2 <= P.VU.ELEN); \
+ require(insn.rs2() + P.VU.vlmul * 2 <= 32);
+
+#define VI_WIDE_CHECK_COMMON \
+ require(!P.VU.vill);\
+ require(P.VU.vlmul <= 4); \
+ require(P.VU.vsew * 2 <= P.VU.ELEN); \
+ require(insn.rd() + P.VU.vlmul * 2 <= 32); \
+ if (insn.v_vm() == 0) \
+ require(insn.rd() != 0);
+
+#define VI_CHECK_VREG_OVERLAP(v1, v2) \
+ require(!is_overlaped(v1, P.VU.vlmul, v2, P.VU.vlmul));
+
+#define VI_CHECK_SS \
+ require(!is_overlaped(insn.rd(), P.VU.vlmul, insn.rs2(), P.VU.vlmul));
+
+#define VI_CHECK_SD \
+ require(!is_overlaped(insn.rd(), P.VU.vlmul, insn.rs2(), P.VU.vlmul * 2));
+
+#define VI_CHECK_DSS(is_rs) \
+ VI_WIDE_CHECK_COMMON; \
+ require(!is_overlaped(insn.rd(), P.VU.vlmul * 2, insn.rs2(), P.VU.vlmul)); \
+ if (is_rs) \
+ require(!is_overlaped(insn.rd(), P.VU.vlmul * 2, insn.rs1(), P.VU.vlmul));
+
+#define VI_CHECK_DDS(is_rs) \
+ VI_WIDE_CHECK_COMMON; \
+ require(insn.rs2() + P.VU.vlmul * 2 <= 32); \
+ if (is_rs) \
+ require(!is_overlaped(insn.rd(), P.VU.vlmul * 2, insn.rs1(), P.VU.vlmul));
+
+//
+// vector: loop header and end helper
+//
+#define VI_GENERAL_LOOP_BASE \
+ require(P.VU.vsew == e8 || P.VU.vsew == e16 || P.VU.vsew == e32 || P.VU.vsew == e64); \
+ require(!P.VU.vill);\
+ reg_t vl = P.VU.vl; \
+ reg_t sew = P.VU.vsew; \
+ reg_t rd_num = insn.rd(); \
+ reg_t rs1_num = insn.rs1(); \
+ reg_t rs2_num = insn.rs2(); \
+ for (reg_t i=P.VU.vstart; i<vl; ++i){
+
+#define VI_TAIL_ZERO(elm) \
+ if (vl != 0 && vl < P.VU.vlmax && TAIL_ZEROING) { \
+ uint8_t *tail = &P.VU.elt<uint8_t>(rd_num, vl * ((sew >> 3) * elm)); \
+ memset(tail, 0, (P.VU.vlmax - vl) * ((sew >> 3) * elm)); \
+ }
+
+#define VI_TAIL_ZERO_MASK(dst) \
+ if (vl != 0 && TAIL_ZEROING){ \
+ for (reg_t i=vl; i<P.VU.vlmax; ++i){ \
+ const int mlen = P.VU.vmlen; \
+ const int midx = (mlen * i) / 64; \
+ const int mpos = (mlen * i) % 64; \
+ uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos); \
+ uint64_t &vdi = P.VU.elt<uint64_t>(dst, midx); \
+ vdi = (vdi & ~mmask);\
+ }\
+ }\
+
+#define VI_LOOP_BASE \
+ VI_GENERAL_LOOP_BASE \
+ VI_LOOP_ELEMENT_SKIP();
+
+#define VI_LOOP_END \
+ } \
+ if (vl != 0 && vl < P.VU.vlmax && TAIL_ZEROING){ \
+ uint8_t *tail = &P.VU.elt<uint8_t>(rd_num, vl * ((sew >> 3) * 1)); \
+ memset(tail, 0, (P.VU.vlmax - vl) * ((sew >> 3) * 1)); \
+ }\
+ P.VU.vstart = 0;
+
+#define VI_LOOP_END_NO_TAIL_ZERO \
+ } \
+ P.VU.vstart = 0;
+
+#define VI_LOOP_WIDEN_END \
+ } \
+ if (vl != 0 && vl < P.VU.vlmax && TAIL_ZEROING){ \
+ uint8_t *tail = &P.VU.elt<uint8_t>(rd_num, vl * ((sew >> 3) * 2)); \
+ memset(tail, 0, (P.VU.vlmax - vl) * ((sew >> 3) * 2)); \
+ }\
+ P.VU.vstart = 0;
+
+#define VI_LOOP_REDUCTION_END(x) \
+ } \
+ if (vl > 0 && TAIL_ZEROING) { \
+ vd_0_des = vd_0_res; \
+ uint8_t *tail = (uint8_t *)&P.VU.elt<type_sew_t<x>::type>(rd_num, 1); \
+ memset(tail, 0, (P.VU.get_vlen() - x) >> 3); \
+ } \
+ P.VU.vstart = 0;
+
+#define VI_LOOP_CMP_BASE \
+ require(P.VU.vsew == e8 || P.VU.vsew == e16 || P.VU.vsew == e32 || P.VU.vsew == e64); \
+ require(!P.VU.vill);\
+ reg_t vl = P.VU.vl; \
+ reg_t sew = P.VU.vsew; \
+ reg_t rd_num = insn.rd(); \
+ reg_t rs1_num = insn.rs1(); \
+ reg_t rs2_num = insn.rs2(); \
+ for (reg_t i=P.VU.vstart; i<vl; ++i){ \
+ VI_LOOP_ELEMENT_SKIP(); \
+ uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos); \
+ uint64_t &vdi = P.VU.elt<uint64_t>(insn.rd(), midx); \
+ uint64_t res = 0;
+
+#define VI_LOOP_CMP_END \
+ vdi = (vdi & ~mmask) | (((res) << mpos) & mmask); \
+ } \
+ VI_TAIL_ZERO_MASK(rd_num); \
+ P.VU.vstart = 0;
+
+#define VI_LOOP_MASK(op) \
+ require(P.VU.vsew <= e64); \
+ reg_t vl = P.VU.vl; \
+ for (reg_t i = P.VU.vstart; i < vl; ++i) { \
+ int mlen = P.VU.vmlen; \
+ int midx = (mlen * i) / 64; \
+ int mpos = (mlen * i) % 64; \
+ uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos); \
+ uint64_t vs2 = P.VU.elt<uint64_t>(insn.rs2(), midx); \
+ uint64_t vs1 = P.VU.elt<uint64_t>(insn.rs1(), midx); \
+ uint64_t &res = P.VU.elt<uint64_t>(insn.rd(), midx); \
+ res = (res & ~mmask) | ((op) & (1ULL << mpos)); \
+ } \
+ \
+ if (TAIL_ZEROING) {\
+ for (reg_t i = vl; i < P.VU.vlmax && i > 0; ++i) { \
+ int mlen = P.VU.vmlen; \
+ int midx = (mlen * i) / 64; \
+ int mpos = (mlen * i) % 64; \
+ uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos); \
+ uint64_t &res = P.VU.elt<uint64_t>(insn.rd(), midx); \
+ res = (res & ~mmask); \
+ } \
+ } \
+ P.VU.vstart = 0;
+
+#define VI_LOOP_NSHIFT_BASE \
+ require(P.VU.vsew <= e32); \
+ if (insn.rd() != 0){ \
+ VI_CHECK_SD; \
+ } \
+ VI_GENERAL_LOOP_BASE; \
+ VI_LOOP_ELEMENT_SKIP({\
+ require(!(insn.rd() == 0 && P.VU.vlmul > 1));\
+ });
+
+
+#define INT_ROUNDING(result, xrm, gb) \
+ if (gb > 0) { \
+ switch(xrm) {\
+ case VRM::RNU:\
+ result += ((uint64_t)1 << ((gb) - 1));\
+ break;\
+ case VRM::RNE:\
+ if ((result & ((uint64_t)0x3 << ((gb) - 1))) == 0x1){\
+ result -= ((uint64_t)1 << ((gb) - 1));\
+ }else if ((result & ((uint64_t)0x3 << ((gb) - 1))) == 0x3){\
+ result += ((uint64_t)1 << ((gb) - 1));\
+ }\
+ break;\
+ case VRM::RDN:\
+ result = (result >> ((gb) - 1)) << ((gb) - 1);\
+ break;\
+ case VRM::ROD:\
+ result |= ((uint64_t)1ul << (gb)); \
+ break;\
+ case VRM::INVALID_RM:\
+        assert(false);\
+ } \
+ } else if (gb == 0 && xrm == VRM::ROD) { \
+ result |= 1ul; \
+ }
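// Editor's worked sketch of the rounding step above: `result` still carries gb guard
// bits that the caller shifts out afterwards.  With result = 11 (0b1011) and gb = 2:
//   RNU: 11 + (1 << 1) = 13, then 13 >> 2 = 3   (round to nearest, ties upward)
//   RDN: guard bits truncated, 11 >> 2 = 2
//   ROD: 11 | (1 << 2) = 15, then 15 >> 2 = 3   (low result bit jammed to 1)
// The hypothetical helper below repeats only the RNU adjustment plus the final shift.
static inline uint64_t sketch_round_rnu(uint64_t result, unsigned gb)
{
  if (gb > 0)
    result += (uint64_t)1 << (gb - 1);  // same adjustment as the VRM::RNU case
  return result >> gb;                  // the shift each caller applies afterwards
}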
+
+
+//
+// vector: integer and masking operand access helper
+//
+#define VXI_PARAMS(x) \
+ type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i); \
+ type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
+ type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
+ type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
+ type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5();
+
+#define VV_U_PARAMS(x) \
+ type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i); \
+ type_usew_t<x>::type vs1 = P.VU.elt<type_usew_t<x>::type>(rs1_num, i); \
+ type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define VX_U_PARAMS(x) \
+ type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i); \
+ type_usew_t<x>::type rs1 = (type_usew_t<x>::type)RS1; \
+ type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define VI_U_PARAMS(x) \
+ type_usew_t<x>::type &vd = P.VU.elt<type_usew_t<x>::type>(rd_num, i); \
+ type_usew_t<x>::type simm5 = (type_usew_t<x>::type)insn.v_zimm5(); \
+ type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define VV_PARAMS(x) \
+ type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i); \
+ type_sew_t<x>::type vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
+ type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
+
+#define VX_PARAMS(x) \
+ type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i); \
+ type_sew_t<x>::type rs1 = (type_sew_t<x>::type)RS1; \
+ type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
+
+#define VI_PARAMS(x) \
+ type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i); \
+ type_sew_t<x>::type simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
+ type_sew_t<x>::type vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i);
+
+#define XV_PARAMS(x) \
+ type_sew_t<x>::type &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i); \
+ type_usew_t<x>::type vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, RS1);
+
+#define VI_XI_SLIDEDOWN_PARAMS(x, off) \
+ auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i); \
+ auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i + off);
+
+#define VI_XI_SLIDEUP_PARAMS(x, offset) \
+ auto &vd = P.VU.elt<type_sew_t<x>::type>(rd_num, i); \
+ auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i - offset);
+
+#define VI_NSHIFT_PARAMS(sew1, sew2) \
+ auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i); \
+ auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
+ auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
+ auto zimm5 = (type_usew_t<sew1>::type)insn.v_zimm5();
+
+#define VX_NSHIFT_PARAMS(sew1, sew2) \
+ auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i); \
+ auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
+ auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
+ auto rs1 = (type_sew_t<sew1>::type)RS1;
+
+#define VV_NSHIFT_PARAMS(sew1, sew2) \
+ auto &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i); \
+ auto vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
+ auto vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
+ auto vs1 = P.VU.elt<type_sew_t<sew1>::type>(rs1_num, i);
+
+#define XI_CARRY_PARAMS(x) \
+ auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
+ auto rs1 = (type_sew_t<x>::type)RS1; \
+ auto simm5 = (type_sew_t<x>::type)insn.v_simm5(); \
+ auto &vd = P.VU.elt<uint64_t>(rd_num, midx);
+
+#define VV_CARRY_PARAMS(x) \
+ auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
+ auto vs1 = P.VU.elt<type_sew_t<x>::type>(rs1_num, i); \
+ auto &vd = P.VU.elt<uint64_t>(rd_num, midx);
+
+//
+// vector: integer and masking operation loop
+//
+
+// comparison result to mask register
+#define VI_VV_LOOP_CMP(BODY) \
+ VI_LOOP_CMP_BASE \
+ if (sew == e8){ \
+ VV_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VV_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VV_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VV_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_CMP_END
+
+#define VI_VX_LOOP_CMP(BODY) \
+ VI_LOOP_CMP_BASE \
+ if (sew == e8){ \
+ VX_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VX_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VX_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VX_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_CMP_END
+
+#define VI_VI_LOOP_CMP(BODY) \
+ VI_LOOP_CMP_BASE \
+ if (sew == e8){ \
+ VI_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VI_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VI_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VI_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_CMP_END
+
+#define VI_VV_ULOOP_CMP(BODY) \
+ VI_LOOP_CMP_BASE \
+ if (sew == e8){ \
+ VV_U_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VV_U_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VV_U_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VV_U_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_CMP_END
+
+#define VI_VX_ULOOP_CMP(BODY) \
+ VI_LOOP_CMP_BASE \
+ if (sew == e8){ \
+ VX_U_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VX_U_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VX_U_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VX_U_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_CMP_END
+
+#define VI_VI_ULOOP_CMP(BODY) \
+ VI_LOOP_CMP_BASE \
+ if (sew == e8){ \
+ VI_U_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VI_U_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VI_U_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VI_U_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_CMP_END
+
+// merge and copy loop
+#define VI_VVXI_MERGE_LOOP(BODY) \
+ VI_GENERAL_LOOP_BASE \
+ if (sew == e8){ \
+ VXI_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VXI_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VXI_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VXI_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+// reduction loop - signed
+#define VI_LOOP_REDUCTION_BASE(x) \
+ require(x == e8 || x == e16 || x == e32 || x == e64); \
+ require(!P.VU.vill);\
+ reg_t vl = P.VU.vl; \
+ reg_t rd_num = insn.rd(); \
+ reg_t rs1_num = insn.rs1(); \
+ reg_t rs2_num = insn.rs2(); \
+ auto &vd_0_des = P.VU.elt<type_sew_t<x>::type>(rd_num, 0); \
+ auto vd_0_res = P.VU.elt<type_sew_t<x>::type>(rs1_num, 0); \
+ for (reg_t i=P.VU.vstart; i<vl; ++i){ \
+ VI_LOOP_ELEMENT_SKIP(); \
+ auto vs2 = P.VU.elt<type_sew_t<x>::type>(rs2_num, i); \
+
+#define REDUCTION_LOOP(x, BODY) \
+ VI_LOOP_REDUCTION_BASE(x) \
+ BODY; \
+ VI_LOOP_REDUCTION_END(x)
+
+#define VI_VV_LOOP_REDUCTION(BODY) \
+ reg_t sew = P.VU.vsew; \
+ if (sew == e8) { \
+ REDUCTION_LOOP(e8, BODY) \
+ } else if(sew == e16) { \
+ REDUCTION_LOOP(e16, BODY) \
+ } else if(sew == e32) { \
+ REDUCTION_LOOP(e32, BODY) \
+ } else if(sew == e64) { \
+ REDUCTION_LOOP(e64, BODY) \
+ }
+
+// reduction loop - unsigned
+#define VI_ULOOP_REDUCTION_BASE(x) \
+ require(x == e8 || x == e16 || x == e32 || x == e64); \
+ reg_t vl = P.VU.vl; \
+ reg_t rd_num = insn.rd(); \
+ reg_t rs1_num = insn.rs1(); \
+ reg_t rs2_num = insn.rs2(); \
+ auto &vd_0_des = P.VU.elt<type_usew_t<x>::type>(rd_num, 0); \
+ auto vd_0_res = P.VU.elt<type_usew_t<x>::type>(rs1_num, 0); \
+ for (reg_t i=P.VU.vstart; i<vl; ++i){ \
+ VI_LOOP_ELEMENT_SKIP(); \
+ auto vs2 = P.VU.elt<type_usew_t<x>::type>(rs2_num, i);
+
+#define REDUCTION_ULOOP(x, BODY) \
+ VI_ULOOP_REDUCTION_BASE(x) \
+ BODY; \
+ VI_LOOP_REDUCTION_END(x)
+
+#define VI_VV_ULOOP_REDUCTION(BODY) \
+ reg_t sew = P.VU.vsew; \
+ if (sew == e8){ \
+ REDUCTION_ULOOP(e8, BODY) \
+ } else if(sew == e16) { \
+ REDUCTION_ULOOP(e16, BODY) \
+ } else if(sew == e32) { \
+ REDUCTION_ULOOP(e32, BODY) \
+ } else if(sew == e64) { \
+ REDUCTION_ULOOP(e64, BODY) \
+ }
+
+// general VXI signed/unsigned loop
+#define VI_VV_ULOOP(BODY) \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VV_U_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VV_U_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VV_U_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VV_U_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+#define VI_VV_LOOP(BODY) \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VV_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VV_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VV_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VV_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+#define VI_VX_ULOOP(BODY) \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VX_U_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VX_U_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VX_U_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VX_U_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+#define VI_VX_LOOP(BODY) \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VX_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VX_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VX_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VX_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+#define VI_VI_ULOOP(BODY) \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VI_U_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VI_U_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VI_U_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VI_U_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+#define VI_VI_LOOP(BODY) \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VI_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VI_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VI_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VI_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+// narrow operation loop
+#define VI_VV_LOOP_NARROW(BODY) \
+VI_NARROW_CHECK_COMMON; \
+VI_LOOP_BASE \
+if (sew == e8){ \
+ VI_NARROW_SHIFT(e8, e16) \
+ BODY; \
+}else if(sew == e16){ \
+ VI_NARROW_SHIFT(e16, e32) \
+ BODY; \
+}else if(sew == e32){ \
+ VI_NARROW_SHIFT(e32, e64) \
+ BODY; \
+} \
+VI_LOOP_END
+
+#define VI_NARROW_SHIFT(sew1, sew2) \
+ type_usew_t<sew1>::type &vd = P.VU.elt<type_usew_t<sew1>::type>(rd_num, i); \
+ type_usew_t<sew2>::type vs2_u = P.VU.elt<type_usew_t<sew2>::type>(rs2_num, i); \
+ type_usew_t<sew1>::type zimm5 = (type_usew_t<sew1>::type)insn.v_zimm5(); \
+ type_sew_t<sew2>::type vs2 = P.VU.elt<type_sew_t<sew2>::type>(rs2_num, i); \
+ type_sew_t<sew1>::type vs1 = P.VU.elt<type_sew_t<sew1>::type>(rs1_num, i); \
+ type_sew_t<sew1>::type rs1 = (type_sew_t<sew1>::type)RS1;
+
+#define VI_VVXI_LOOP_NARROW(BODY) \
+ require(P.VU.vsew <= e32); \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VI_NARROW_SHIFT(e8, e16) \
+ BODY; \
+ } else if (sew == e16) { \
+ VI_NARROW_SHIFT(e16, e32) \
+ BODY; \
+ } else if (sew == e32) { \
+ VI_NARROW_SHIFT(e32, e64) \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+#define VI_VI_LOOP_NSHIFT(BODY) \
+ VI_LOOP_NSHIFT_BASE \
+ if (sew == e8){ \
+ VI_NSHIFT_PARAMS(e8, e16) \
+ BODY; \
+ } else if (sew == e16) { \
+ VI_NSHIFT_PARAMS(e16, e32) \
+ BODY; \
+ } else if (sew == e32) { \
+ VI_NSHIFT_PARAMS(e32, e64) \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+#define VI_VX_LOOP_NSHIFT(BODY) \
+ VI_LOOP_NSHIFT_BASE \
+ if (sew == e8){ \
+ VX_NSHIFT_PARAMS(e8, e16) \
+ BODY; \
+ } else if (sew == e16) { \
+ VX_NSHIFT_PARAMS(e16, e32) \
+ BODY; \
+ } else if (sew == e32) { \
+ VX_NSHIFT_PARAMS(e32, e64) \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+#define VI_VV_LOOP_NSHIFT(BODY) \
+ VI_LOOP_NSHIFT_BASE \
+ if (sew == e8){ \
+ VV_NSHIFT_PARAMS(e8, e16) \
+ BODY; \
+ } else if (sew == e16) { \
+ VV_NSHIFT_PARAMS(e16, e32) \
+ BODY; \
+ } else if (sew == e32) { \
+ VV_NSHIFT_PARAMS(e32, e64) \
+ BODY; \
+ } \
+ VI_LOOP_END
+
+// widen operation loop
+#define VI_VV_LOOP_WIDEN(BODY) \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VV_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VV_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VV_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VV_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_WIDEN_END
+
+#define VI_VX_LOOP_WIDEN(BODY) \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VX_PARAMS(e8); \
+ BODY; \
+ }else if(sew == e16){ \
+ VX_PARAMS(e16); \
+ BODY; \
+ }else if(sew == e32){ \
+ VX_PARAMS(e32); \
+ BODY; \
+ }else if(sew == e64){ \
+ VX_PARAMS(e64); \
+ BODY; \
+ } \
+ VI_LOOP_WIDEN_END
+
+#define VI_WIDE_OP_AND_ASSIGN(var0, var1, var2, op0, op1, sign) \
+ switch(P.VU.vsew) { \
+ case e8: { \
+ sign##16_t vd_w = P.VU.elt<sign##16_t>(rd_num, i); \
+ P.VU.elt<uint16_t>(rd_num, i) = \
+ op1((sign##16_t)(sign##8_t)var0 op0 (sign##16_t)(sign##8_t)var1) + var2; \
+ } \
+ break; \
+ case e16: { \
+ sign##32_t vd_w = P.VU.elt<sign##32_t>(rd_num, i); \
+ P.VU.elt<uint32_t>(rd_num, i) = \
+ op1((sign##32_t)(sign##16_t)var0 op0 (sign##32_t)(sign##16_t)var1) + var2; \
+ } \
+ break; \
+ default: { \
+ sign##64_t vd_w = P.VU.elt<sign##64_t>(rd_num, i); \
+ P.VU.elt<uint64_t>(rd_num, i) = \
+ op1((sign##64_t)(sign##32_t)var0 op0 (sign##64_t)(sign##32_t)var1) + var2; \
+ } \
+ break; \
+ }
+
+#define VI_WIDE_OP_AND_ASSIGN_MIX(var0, var1, var2, op0, op1, sign_d, sign_1, sign_2) \
+ switch(P.VU.vsew) { \
+ case e8: { \
+ sign_d##16_t vd_w = P.VU.elt<sign_d##16_t>(rd_num, i); \
+ P.VU.elt<uint16_t>(rd_num, i) = \
+ op1((sign_1##16_t)(sign_1##8_t)var0 op0 (sign_2##16_t)(sign_2##8_t)var1) + var2; \
+ } \
+ break; \
+ case e16: { \
+ sign_d##32_t vd_w = P.VU.elt<sign_d##32_t>(rd_num, i); \
+ P.VU.elt<uint32_t>(rd_num, i) = \
+ op1((sign_1##32_t)(sign_1##16_t)var0 op0 (sign_2##32_t)(sign_2##16_t)var1) + var2; \
+ } \
+ break; \
+ default: { \
+ sign_d##64_t vd_w = P.VU.elt<sign_d##64_t>(rd_num, i); \
+ P.VU.elt<uint64_t>(rd_num, i) = \
+ op1((sign_1##64_t)(sign_1##32_t)var0 op0 (sign_2##64_t)(sign_2##32_t)var1) + var2; \
+ } \
+ break; \
+ }
+
+#define VI_WIDE_WVX_OP(var0, op0, sign) \
+ switch(P.VU.vsew) { \
+ case e8: { \
+ sign##16_t &vd_w = P.VU.elt<sign##16_t>(rd_num, i); \
+ sign##16_t vs2_w = P.VU.elt<sign##16_t>(rs2_num, i); \
+ vd_w = vs2_w op0 (sign##16_t)(sign##8_t)var0; \
+ } \
+ break; \
+ case e16: { \
+ sign##32_t &vd_w = P.VU.elt<sign##32_t>(rd_num, i); \
+ sign##32_t vs2_w = P.VU.elt<sign##32_t>(rs2_num, i); \
+ vd_w = vs2_w op0 (sign##32_t)(sign##16_t)var0; \
+ } \
+ break; \
+ default: { \
+ sign##64_t &vd_w = P.VU.elt<sign##64_t>(rd_num, i); \
+ sign##64_t vs2_w = P.VU.elt<sign##64_t>(rs2_num, i); \
+ vd_w = vs2_w op0 (sign##64_t)(sign##32_t)var0; \
+ } \
+ break; \
+ }
+
+#define VI_WIDE_SSMA(sew1, sew2, opd) \
+ auto &vd = P.VU.elt<type_sew_t<sew2>::type>(rd_num, i); \
+ auto vs1 = P.VU.elt<type_sew_t<sew1>::type>(rs1_num, i); \
+ auto vs2 = P.VU.elt<type_sew_t<sew1>::type>(rs2_num, i); \
+ auto rs1 = (type_sew_t<sew1>::type)RS1; \
+ int##sew2##_t res; \
+ bool sat = false; \
+ const int gb = sew1 / 2; \
+ VRM vrm = P.VU.get_vround_mode(); \
+ res = (int##sew2##_t)vs2 * (int##sew2##_t)opd; \
+ INT_ROUNDING(res, vrm, gb); \
+ res = res >> gb; \
+ vd = sat_add<int##sew2##_t, uint##sew2##_t>(vd, res, sat); \
+ P.VU.vxsat |= sat;
+
+#define VI_VVX_LOOP_WIDE_SSMA(opd) \
+ VI_WIDE_CHECK_COMMON \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VI_WIDE_SSMA(8, 16, opd); \
+ } else if(sew == e16){ \
+ VI_WIDE_SSMA(16, 32, opd); \
+ } else if(sew == e32){ \
+ VI_WIDE_SSMA(32, 64, opd); \
+ } \
+ VI_LOOP_WIDEN_END
+
+#define VI_WIDE_USSMA(sew1, sew2, opd) \
+ auto &vd = P.VU.elt<type_usew_t<sew2>::type>(rd_num, i); \
+ auto vs1 = P.VU.elt<type_usew_t<sew1>::type>(rs1_num, i); \
+ auto vs2 = P.VU.elt<type_usew_t<sew1>::type>(rs2_num, i); \
+ auto rs1 = (type_usew_t<sew1>::type)RS1; \
+ uint##sew2##_t res; \
+ bool sat = false; \
+ const int gb = sew1 / 2; \
+ VRM vrm = P.VU.get_vround_mode(); \
+ res = (uint##sew2##_t)vs2 * (uint##sew2##_t)opd; \
+ INT_ROUNDING(res, vrm, gb); \
+ \
+ res = res >> gb; \
+ vd = sat_addu<uint##sew2##_t>(vd, res, sat); \
+ P.VU.vxsat |= sat;
+
+#define VI_VVX_LOOP_WIDE_USSMA(opd) \
+ VI_WIDE_CHECK_COMMON \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VI_WIDE_USSMA(8, 16, opd); \
+ } else if(sew == e16){ \
+ VI_WIDE_USSMA(16, 32, opd); \
+ } else if(sew == e32){ \
+ VI_WIDE_USSMA(32, 64, opd); \
+ } \
+ VI_LOOP_WIDEN_END
+
+#define VI_WIDE_SU_SSMA(sew1, sew2, opd) \
+ auto &vd = P.VU.elt<type_sew_t<sew2>::type>(rd_num, i); \
+ auto vs1 = P.VU.elt<type_sew_t<sew1>::type>(rs1_num, i); \
+ auto vs2 = P.VU.elt<type_usew_t<sew1>::type>(rs2_num, i); \
+ auto rs1 = (type_sew_t<sew1>::type)RS1; \
+ int##sew2##_t res; \
+ bool sat = false; \
+ const int gb = sew1 / 2; \
+ VRM vrm = P.VU.get_vround_mode(); \
+ res = (uint##sew2##_t)vs2 * (int##sew2##_t)opd; \
+ INT_ROUNDING(res, vrm, gb); \
+ \
+ res = res >> gb; \
+ vd = sat_sub<int##sew2##_t, uint##sew2##_t>(vd, res, sat); \
+ P.VU.vxsat |= sat;
+
+#define VI_VVX_LOOP_WIDE_SU_SSMA(opd) \
+ VI_WIDE_CHECK_COMMON \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VI_WIDE_SU_SSMA(8, 16, opd); \
+ } else if(sew == e16){ \
+ VI_WIDE_SU_SSMA(16, 32, opd); \
+ } else if(sew == e32){ \
+ VI_WIDE_SU_SSMA(32, 64, opd); \
+ } \
+ VI_LOOP_WIDEN_END
+
+#define VI_WIDE_US_SSMA(sew1, sew2, opd) \
+ auto &vd = P.VU.elt<type_sew_t<sew2>::type>(rd_num, i); \
+ auto vs1 = P.VU.elt<type_usew_t<sew1>::type>(rs1_num, i); \
+ auto vs2 = P.VU.elt<type_sew_t<sew1>::type>(rs2_num, i); \
+ auto rs1 = (type_usew_t<sew1>::type)RS1; \
+ int##sew2##_t res; \
+ bool sat = false; \
+ const int gb = sew1 / 2; \
+ VRM vrm = P.VU.get_vround_mode(); \
+ res = (int##sew2##_t)vs2 * (uint##sew2##_t)opd; \
+ INT_ROUNDING(res, vrm, gb); \
+ \
+ res = res >> gb; \
+ vd = sat_sub<int##sew2##_t, uint##sew2##_t>(vd, res, sat); \
+ P.VU.vxsat |= sat;
+
+#define VI_VVX_LOOP_WIDE_US_SSMA(opd) \
+ VI_WIDE_CHECK_COMMON \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VI_WIDE_US_SSMA(8, 16, opd); \
+ } else if(sew == e16){ \
+ VI_WIDE_US_SSMA(16, 32, opd); \
+ } else if(sew == e32){ \
+ VI_WIDE_US_SSMA(32, 64, opd); \
+ } \
+ VI_LOOP_WIDEN_END
+
+// wide reduction loop - signed
+#define VI_LOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
+ VI_CHECK_DSS(false); \
+ reg_t vl = P.VU.vl; \
+ reg_t rd_num = insn.rd(); \
+ reg_t rs1_num = insn.rs1(); \
+ reg_t rs2_num = insn.rs2(); \
+ auto &vd_0_des = P.VU.elt<type_sew_t<sew2>::type>(rd_num, 0); \
+ auto vd_0_res = P.VU.elt<type_sew_t<sew2>::type>(rs1_num, 0); \
+ for (reg_t i=P.VU.vstart; i<vl; ++i){ \
+ VI_LOOP_ELEMENT_SKIP(); \
+ auto vs2 = P.VU.elt<type_sew_t<sew1>::type>(rs2_num, i);
+
+#define WIDE_REDUCTION_LOOP(sew1, sew2, BODY) \
+ VI_LOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
+ BODY; \
+ VI_LOOP_REDUCTION_END(sew2)
+
+#define VI_VV_LOOP_WIDE_REDUCTION(BODY) \
+ require(!P.VU.vill);\
+ reg_t sew = P.VU.vsew; \
+ if (sew == e8){ \
+ WIDE_REDUCTION_LOOP(e8, e16, BODY) \
+ } else if(sew == e16){ \
+ WIDE_REDUCTION_LOOP(e16, e32, BODY) \
+ } else if(sew == e32){ \
+ WIDE_REDUCTION_LOOP(e32, e64, BODY) \
+ }
+
+// wide reduction loop - unsigned
+#define VI_ULOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
+ VI_CHECK_DSS(false); \
+ reg_t vl = P.VU.vl; \
+ reg_t rd_num = insn.rd(); \
+ reg_t rs1_num = insn.rs1(); \
+ reg_t rs2_num = insn.rs2(); \
+ auto &vd_0_des = P.VU.elt<type_usew_t<sew2>::type>(rd_num, 0); \
+ auto vd_0_res = P.VU.elt<type_usew_t<sew2>::type>(rs1_num, 0); \
+ for (reg_t i=P.VU.vstart; i<vl; ++i) { \
+ VI_LOOP_ELEMENT_SKIP(); \
+ auto vs2 = P.VU.elt<type_usew_t<sew1>::type>(rs2_num, i);
+
+#define WIDE_REDUCTION_ULOOP(sew1, sew2, BODY) \
+ VI_ULOOP_WIDE_REDUCTION_BASE(sew1, sew2) \
+ BODY; \
+ VI_LOOP_REDUCTION_END(sew2)
+
+#define VI_VV_ULOOP_WIDE_REDUCTION(BODY) \
+ require(!P.VU.vill);\
+ reg_t sew = P.VU.vsew; \
+ if (sew == e8){ \
+ WIDE_REDUCTION_ULOOP(e8, e16, BODY) \
+ } else if(sew == e16){ \
+ WIDE_REDUCTION_ULOOP(e16, e32, BODY) \
+ } else if(sew == e32){ \
+ WIDE_REDUCTION_ULOOP(e32, e64, BODY) \
+ }
+
+// carry/borrow bit loop
+#define VI_VV_LOOP_CARRY(BODY) \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ VV_CARRY_PARAMS(e8) \
+ BODY; \
+ } else if (sew == e16) { \
+ VV_CARRY_PARAMS(e16) \
+ BODY; \
+ } else if (sew == e32) { \
+ VV_CARRY_PARAMS(e32) \
+ BODY; \
+ } else if (sew == e64) { \
+ VV_CARRY_PARAMS(e64) \
+ BODY; \
+ } \
+ } \
+ VI_TAIL_ZERO_MASK(rd_num);
+
+#define VI_XI_LOOP_CARRY(BODY) \
+ VI_LOOP_BASE \
+ if (sew == e8){ \
+ XI_CARRY_PARAMS(e8) \
+ BODY; \
+ } else if (sew == e16) { \
+ XI_CARRY_PARAMS(e16) \
+ BODY; \
+ } else if (sew == e32) { \
+ XI_CARRY_PARAMS(e32) \
+ BODY; \
+ } else if (sew == e64) { \
+ XI_CARRY_PARAMS(e64) \
+ BODY; \
+ } \
+ } \
+ VI_TAIL_ZERO_MASK(rd_num);
+
+// average loop
+#define VI_VVX_LOOP_AVG(opd, op) \
+VRM xrm = p->VU.get_vround_mode(); \
+VI_LOOP_BASE \
+ switch(sew) { \
+ case e8: { \
+ VV_PARAMS(e8); \
+ type_sew_t<e8>::type rs1 = RS1; \
+ auto res = (int32_t)vs2 op opd; \
+ INT_ROUNDING(res, xrm, 1); \
+ vd = res >> 1; \
+ break; \
+ } \
+ case e16: { \
+ VV_PARAMS(e16); \
+ type_sew_t<e16>::type rs1 = RS1; \
+ auto res = (int32_t)vs2 op opd; \
+ INT_ROUNDING(res, xrm, 1); \
+ vd = res >> 1; \
+ break; \
+ } \
+ case e32: { \
+ VV_PARAMS(e32); \
+ type_sew_t<e32>::type rs1 = RS1; \
+ auto res = (int64_t)vs2 op opd; \
+ INT_ROUNDING(res, xrm, 1); \
+ vd = res >> 1; \
+ break; \
+ } \
+ default: { \
+ VV_PARAMS(e64); \
+ type_sew_t<e64>::type rs1 = RS1; \
+ auto res = (int128_t)vs2 op opd; \
+ INT_ROUNDING(res, xrm, 1); \
+ vd = res >> 1; \
+ break; \
+ } \
+ } \
+VI_LOOP_END
// Seems that 0x0 doesn't work.
#define DEBUG_START 0x100
#define DEBUG_END (0x1000 - 1)
diff --git a/riscv/insns/vaadd_vi.h b/riscv/insns/vaadd_vi.h
new file mode 100644
index 0000000..5f8d5f5
--- /dev/null
+++ b/riscv/insns/vaadd_vi.h
@@ -0,0 +1,9 @@
+// vaadd: averaging add of integers
+VRM xrm = P.VU.get_vround_mode();
+VI_VI_LOOP
+({
+ int64_t result = simm5 + vs2;
+ INT_ROUNDING(result, xrm, 1);
+ result = vzext(result >> 1, sew);
+ vd = result;
+})
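A quick worked example of the averaging add above (editor's sketch, assuming round-to-nearest-up and sew = 8): for simm5 = 5 and vs2 = 6 the sum is 11, INT_ROUNDING adds 1 << 0 = 1, and the single-bit shift gives vd = (5 + 6 + 1) >> 1 = 6.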
diff --git a/riscv/insns/vaadd_vv.h b/riscv/insns/vaadd_vv.h
new file mode 100644
index 0000000..b479970
--- /dev/null
+++ b/riscv/insns/vaadd_vv.h
@@ -0,0 +1,2 @@
+// vaadd.vv vd, vs2, vs1
+VI_VVX_LOOP_AVG(vs1, +);
diff --git a/riscv/insns/vaadd_vx.h b/riscv/insns/vaadd_vx.h
new file mode 100644
index 0000000..c811a0a
--- /dev/null
+++ b/riscv/insns/vaadd_vx.h
@@ -0,0 +1,2 @@
+// vaadd.vx vd, vs2, rs1
+VI_VVX_LOOP_AVG(rs1, +);
diff --git a/riscv/insns/vadc_vim.h b/riscv/insns/vadc_vim.h
new file mode 100644
index 0000000..e21e2f8
--- /dev/null
+++ b/riscv/insns/vadc_vim.h
@@ -0,0 +1,11 @@
+// vadc.vim vd, vs2, simm5
+require(!(insn.rd() == 0 && P.VU.vlmul > 1));
+VI_VI_LOOP
+({
+ auto &v0 = P.VU.elt<uint64_t>(0, midx);
+ const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+ uint64_t carry = (v0 >> mpos) & 0x1;
+
+ uint128_t res = (op_mask & simm5) + (op_mask & vs2) + carry;
+ vd = res;
+})
diff --git a/riscv/insns/vadc_vvm.h b/riscv/insns/vadc_vvm.h
new file mode 100644
index 0000000..b708ac1
--- /dev/null
+++ b/riscv/insns/vadc_vvm.h
@@ -0,0 +1,11 @@
+// vadc.vvm vd, vs2, rs1
+require(!(insn.rd() == 0 && P.VU.vlmul > 1));
+VI_VV_LOOP
+({
+ auto &v0 = P.VU.elt<uint64_t>(0, midx);
+ const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+ uint64_t carry = (v0 >> mpos) & 0x1;
+
+ uint128_t res = (op_mask & vs1) + (op_mask & vs2) + carry;
+ vd = res;
+})
diff --git a/riscv/insns/vadc_vxm.h b/riscv/insns/vadc_vxm.h
new file mode 100644
index 0000000..6c6e6dc
--- /dev/null
+++ b/riscv/insns/vadc_vxm.h
@@ -0,0 +1,11 @@
+// vadc.vxm vd, vs2, rs1
+require(!(insn.rd() == 0 && P.VU.vlmul > 1));
+VI_VX_LOOP
+({
+ auto &v0 = P.VU.elt<uint64_t>(0, midx);
+ const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+ uint64_t carry = (v0 >> mpos) & 0x1;
+
+ uint128_t res = (op_mask & rs1) + (op_mask & vs2) + carry;
+ vd = res;
+})
diff --git a/riscv/insns/vadd_vi.h b/riscv/insns/vadd_vi.h
new file mode 100644
index 0000000..45fc6b7
--- /dev/null
+++ b/riscv/insns/vadd_vi.h
@@ -0,0 +1,5 @@
+// vadd.vi vd, simm5, vs2, vm
+VI_VI_LOOP
+({
+ vd = simm5 + vs2;
+})
diff --git a/riscv/insns/vadd_vv.h b/riscv/insns/vadd_vv.h
new file mode 100644
index 0000000..45c6bdc
--- /dev/null
+++ b/riscv/insns/vadd_vv.h
@@ -0,0 +1,5 @@
+// vadd.vv vd, vs1, vs2, vm
+VI_VV_LOOP
+({
+ vd = vs1 + vs2;
+})
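For orientation, a minimal sketch (editor's example, assuming type_sew_t<e8>::type is int8_t) of roughly what VI_VV_LOOP({ vd = vs1 + vs2; }) performs per element for SEW=8, with the masking, SEW dispatch and tail zeroing of VI_LOOP_BASE/VI_LOOP_END omitted:

for (reg_t i = P.VU.vstart; i < P.VU.vl; ++i) {
  int8_t &vd  = P.VU.elt<int8_t>(insn.rd(),  i);  // VV_PARAMS(e8): destination element
  int8_t  vs1 = P.VU.elt<int8_t>(insn.rs1(), i);  // source elements
  int8_t  vs2 = P.VU.elt<int8_t>(insn.rs2(), i);
  vd = vs1 + vs2;                                 // the body supplied by vadd_vv.h
}
P.VU.vstart = 0;                                  // VI_LOOP_END resets vstart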
diff --git a/riscv/insns/vadd_vx.h b/riscv/insns/vadd_vx.h
new file mode 100644
index 0000000..33e72ee
--- /dev/null
+++ b/riscv/insns/vadd_vx.h
@@ -0,0 +1,5 @@
+// vadd.vx vd, rs1, vs2, vm
+VI_VX_LOOP
+({
+ vd = rs1 + vs2;
+})
diff --git a/riscv/insns/vand_vi.h b/riscv/insns/vand_vi.h
new file mode 100644
index 0000000..dd9618b
--- /dev/null
+++ b/riscv/insns/vand_vi.h
@@ -0,0 +1,5 @@
+// vand.vi vd, simm5, vs2, vm
+VI_VI_LOOP
+({
+ vd = simm5 & vs2;
+})
diff --git a/riscv/insns/vand_vv.h b/riscv/insns/vand_vv.h
new file mode 100644
index 0000000..65558e4
--- /dev/null
+++ b/riscv/insns/vand_vv.h
@@ -0,0 +1,5 @@
+// vand.vv vd, vs1, vs2, vm
+VI_VV_LOOP
+({
+ vd = vs1 & vs2;
+})
diff --git a/riscv/insns/vand_vx.h b/riscv/insns/vand_vx.h
new file mode 100644
index 0000000..8eea1ed
--- /dev/null
+++ b/riscv/insns/vand_vx.h
@@ -0,0 +1,5 @@
+// vand.vx vd, rs1, vs2, vm
+VI_VX_LOOP
+({
+ vd = rs1 & vs2;
+})
diff --git a/riscv/insns/vasub_vv.h b/riscv/insns/vasub_vv.h
new file mode 100644
index 0000000..5a5ccc9
--- /dev/null
+++ b/riscv/insns/vasub_vv.h
@@ -0,0 +1,2 @@
+// vasub.vv vd, vs2, vs1
+VI_VVX_LOOP_AVG(vs1, -);
diff --git a/riscv/insns/vasub_vx.h b/riscv/insns/vasub_vx.h
new file mode 100644
index 0000000..c3cad4b
--- /dev/null
+++ b/riscv/insns/vasub_vx.h
@@ -0,0 +1,2 @@
+// vasub.vx vd, vs2, rs1
+VI_VVX_LOOP_AVG(rs1, -);
diff --git a/riscv/insns/vcompress_vm.h b/riscv/insns/vcompress_vm.h
new file mode 100644
index 0000000..2e0784c
--- /dev/null
+++ b/riscv/insns/vcompress_vm.h
@@ -0,0 +1,41 @@
+// vcompress vd, vs2, vs1
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+require(P.VU.vstart == 0);
+reg_t sew = P.VU.vsew;
+reg_t vl = P.VU.vl;
+reg_t rd_num = insn.rd();
+reg_t rs1_num = insn.rs1();
+reg_t rs2_num = insn.rs2();
+reg_t pos = 0;
+for (reg_t i = P.VU.vstart ; i < vl; ++i) {
+ const int mlen = P.VU.vmlen;
+ const int midx = (mlen * i) / 64;
+ const int mpos = (mlen * i) % 64;
+
+ bool do_mask = (P.VU.elt<uint64_t>(rs1_num, midx) >> mpos) & 0x1;
+ if (do_mask) {
+ switch (sew) {
+ case e8:
+ P.VU.elt<uint8_t>(rd_num, pos) = P.VU.elt<uint8_t>(rs2_num, i);
+ break;
+ case e16:
+ P.VU.elt<uint16_t>(rd_num, pos) = P.VU.elt<uint16_t>(rs2_num, i);
+ break;
+ case e32:
+ P.VU.elt<uint32_t>(rd_num, pos) = P.VU.elt<uint32_t>(rs2_num, i);
+ break;
+ default:
+ P.VU.elt<uint64_t>(rd_num, pos) = P.VU.elt<uint64_t>(rs2_num, i);
+ break;
+ }
+
+ ++pos;
+ }
+}
+
+if (vl > 0 && TAIL_ZEROING) {
+ uint8_t *tail = &P.VU.elt<uint8_t>(rd_num, pos * ((sew >> 3) * 1));
+ memset(tail, 0, (P.VU.vlmax - pos) * ((sew >> 3) * 1));
+}
+
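As a small worked example of the compress loop above (editor's sketch): with vl = 4, mask bits in vs1 = 0b1010 and source elements vs2 = {10, 20, 30, 40}, only elements 1 and 3 are selected, so vd is packed as {20, 40} at positions 0 and 1, and the remaining tail is zeroed when TAIL_ZEROING is in effect.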
diff --git a/riscv/insns/vdiv_vv.h b/riscv/insns/vdiv_vv.h
new file mode 100644
index 0000000..67da162
--- /dev/null
+++ b/riscv/insns/vdiv_vv.h
@@ -0,0 +1,10 @@
+// vdiv.vv vd, vs2, vs1
+VI_VV_LOOP
+({
+ if (vs1 == 0)
+ vd = -1;
+ else if (vs2 == -(1 << (sew - 1)) && vs1 == -1)
+ vd = vs2;
+ else
+ vd = vs2 / vs1;
+})
diff --git a/riscv/insns/vdiv_vx.h b/riscv/insns/vdiv_vx.h
new file mode 100644
index 0000000..1a152bd
--- /dev/null
+++ b/riscv/insns/vdiv_vx.h
@@ -0,0 +1,10 @@
+// vdiv.vx vd, vs2, rs1
+VI_VX_LOOP
+({
+ if(rs1 == 0)
+ vd = -1;
+ else if(vs2 == -(1 << (sew - 1)) && rs1 == -1)
+ vd = vs2;
+ else
+ vd = vs2 / rs1;
+})
diff --git a/riscv/insns/vdivu_vv.h b/riscv/insns/vdivu_vv.h
new file mode 100644
index 0000000..ef6e777
--- /dev/null
+++ b/riscv/insns/vdivu_vv.h
@@ -0,0 +1,8 @@
+// vdivu.vv vd, vs2, vs1
+VI_VV_ULOOP
+({
+ if(vs1 == 0)
+ vd = -1;
+ else
+ vd = vs2 / vs1;
+})
diff --git a/riscv/insns/vdivu_vx.h b/riscv/insns/vdivu_vx.h
new file mode 100644
index 0000000..7ffe1c6
--- /dev/null
+++ b/riscv/insns/vdivu_vx.h
@@ -0,0 +1,8 @@
+// vdivu.vx vd, vs2, rs1
+VI_VX_ULOOP
+({
+ if(rs1 == 0)
+ vd = -1;
+ else
+ vd = vs2 / rs1;
+})
diff --git a/riscv/insns/vdot_vv.h b/riscv/insns/vdot_vv.h
new file mode 100644
index 0000000..7685230
--- /dev/null
+++ b/riscv/insns/vdot_vv.h
@@ -0,0 +1,5 @@
+// vdot vd, vs2, vs1
+VI_VV_LOOP
+({
+ vd += vs2 * vs1;
+})
diff --git a/riscv/insns/vdotu_vv.h b/riscv/insns/vdotu_vv.h
new file mode 100644
index 0000000..9c4c59d
--- /dev/null
+++ b/riscv/insns/vdotu_vv.h
@@ -0,0 +1,5 @@
+// vdotu vd, vs2, vs1
+VI_VV_ULOOP
+({
+ vd += vs2 * vs1;
+})
diff --git a/riscv/insns/vext_x_v.h b/riscv/insns/vext_x_v.h
new file mode 100644
index 0000000..837cc22
--- /dev/null
+++ b/riscv/insns/vext_x_v.h
@@ -0,0 +1,30 @@
+// vext_x_v: rd = vs2[rs1]
+require(insn.v_vm() == 1);
+uint64_t xmask = UINT64_MAX >> (64 - P.get_max_xlen());
+reg_t rs1 = RS1;
+VI_LOOP_BASE
+VI_LOOP_END_NO_TAIL_ZERO
+if (!(rs1 >= 0 && rs1 < (P.VU.get_vlen()/sew))) {
+ WRITE_RD(0);
+} else {
+ switch(sew) {
+ case e8:
+ WRITE_RD(P.VU.elt<uint8_t>(rs2_num, rs1));
+ break;
+ case e16:
+ WRITE_RD(P.VU.elt<uint16_t>(rs2_num, rs1));
+ break;
+ case e32:
+ if (P.get_max_xlen() == 32)
+ WRITE_RD(P.VU.elt<int32_t>(rs2_num, rs1));
+ else
+ WRITE_RD(P.VU.elt<uint32_t>(rs2_num, rs1));
+ break;
+ case e64:
+ if (P.get_max_xlen() <= sew)
+ WRITE_RD(P.VU.elt<uint64_t>(rs2_num, rs1) & xmask);
+ else
+ WRITE_RD(P.VU.elt<uint64_t>(rs2_num, rs1));
+ break;
+ }
+}
diff --git a/riscv/insns/vid_v.h b/riscv/insns/vid_v.h
new file mode 100644
index 0000000..ac111d0
--- /dev/null
+++ b/riscv/insns/vid_v.h
@@ -0,0 +1,30 @@
+// vid.v vd, vm
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs1_num = insn.rs1();
+reg_t rs2_num = insn.rs2();
+
+for (reg_t i = P.VU.vstart ; i < P.VU.vl; ++i) {
+ VI_LOOP_ELEMENT_SKIP();
+
+ switch (sew) {
+ case e8:
+ P.VU.elt<uint8_t>(rd_num, i) = i;
+ break;
+ case e16:
+ P.VU.elt<uint16_t>(rd_num, i) = i;
+ break;
+ case e32:
+ P.VU.elt<uint32_t>(rd_num, i) = i;
+ break;
+ default:
+ P.VU.elt<uint64_t>(rd_num, i) = i;
+ break;
+ }
+}
+
+VI_TAIL_ZERO(1);
+P.VU.vstart = 0;
diff --git a/riscv/insns/viota_m.h b/riscv/insns/viota_m.h
new file mode 100644
index 0000000..c7b831a
--- /dev/null
+++ b/riscv/insns/viota_m.h
@@ -0,0 +1,52 @@
+// viota.m vd, vs2, vm
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs1_num = insn.rs1();
+reg_t rs2_num = insn.rs2();
+require(P.VU.vstart == 0);
+
+int cnt = 0;
+for (reg_t i = 0; i < vl; ++i) {
+ const int mlen = P.VU.vmlen;
+ const int midx = (mlen * i) / 64;
+ const int mpos = (mlen * i) % 64;
+
+ bool vs2_lsb = ((P.VU.elt<uint64_t>(rs2_num, midx) >> mpos) & 0x1) == 1;
+ bool do_mask = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+ bool has_one = false;
+ if (insn.v_vm() == 1 || (insn.v_vm() == 0 && do_mask)) {
+ if (vs2_lsb) {
+ has_one = true;
+ }
+ }
+
+ bool use_ori = (insn.v_vm() == 0) && !do_mask;
+ switch (sew) {
+ case e8:
+ P.VU.elt<uint8_t>(rd_num, i) = use_ori ?
+ P.VU.elt<uint8_t>(rd_num, i) : cnt;
+ break;
+ case e16:
+ P.VU.elt<uint16_t>(rd_num, i) = use_ori ?
+ P.VU.elt<uint16_t>(rd_num, i) : cnt;
+ break;
+ case e32:
+ P.VU.elt<uint32_t>(rd_num, i) = use_ori ?
+ P.VU.elt<uint32_t>(rd_num, i) : cnt;
+ break;
+ default:
+ P.VU.elt<uint64_t>(rd_num, i) = use_ori ?
+ P.VU.elt<uint64_t>(rd_num, i) : cnt;
+ break;
+ }
+
+ if (has_one) {
+ cnt++;
+ }
+}
+
+VI_TAIL_ZERO(1);
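A short worked example of the running count above (editor's sketch, unmasked): for vl = 5 and source bits vs2 = 1,0,1,1,0, each destination element receives the number of 1s seen in earlier positions, so vd = {0, 1, 1, 2, 3}.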
diff --git a/riscv/insns/vmacc_vv.h b/riscv/insns/vmacc_vv.h
new file mode 100644
index 0000000..e6ec93f
--- /dev/null
+++ b/riscv/insns/vmacc_vv.h
@@ -0,0 +1,5 @@
+// vmacc.vv: vd[i] = +(vs1[i] * vs2[i]) + vd[i]
+VI_VV_LOOP
+({
+ vd = vs1 * vs2 + vd;
+})
diff --git a/riscv/insns/vmacc_vx.h b/riscv/insns/vmacc_vx.h
new file mode 100644
index 0000000..d40b264
--- /dev/null
+++ b/riscv/insns/vmacc_vx.h
@@ -0,0 +1,5 @@
+// vmacc.vx: vd[i] = +(x[rs1] * vs2[i]) + vd[i]
+VI_VX_LOOP
+({
+ vd = rs1 * vs2 + vd;
+})
diff --git a/riscv/insns/vmadc_vim.h b/riscv/insns/vmadc_vim.h
new file mode 100644
index 0000000..fd79089
--- /dev/null
+++ b/riscv/insns/vmadc_vim.h
@@ -0,0 +1,14 @@
+// vmadc.vim vd, vs2, simm5
+require(!(insn.rd() == 0 && P.VU.vlmul > 1));
+VI_XI_LOOP_CARRY
+({
+ auto v0 = P.VU.elt<uint64_t>(0, midx);
+ const uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos);
+ const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+ uint64_t carry = (v0 >> mpos) & 0x1;
+
+ uint128_t res = (op_mask & simm5) + (op_mask & vs2) + carry;
+
+ carry = (res >> sew) & 0x1u;
+ vd = (vd & ~mmask) | ((carry << mpos) & mmask);
+})
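For instance (editor's sketch, sew = 8, carry-in 0): vs2 = 0xFF and simm5 = 1 give res = 0x100, so bit 8 is the carry-out and the corresponding mask bit of vd is set to 1.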
diff --git a/riscv/insns/vmadc_vvm.h b/riscv/insns/vmadc_vvm.h
new file mode 100644
index 0000000..82042ca
--- /dev/null
+++ b/riscv/insns/vmadc_vvm.h
@@ -0,0 +1,14 @@
+// vmadc.vvm vd, vs2, rs1
+require(!(insn.rd() == 0 && P.VU.vlmul > 1));
+VI_VV_LOOP_CARRY
+({
+ auto v0 = P.VU.elt<uint64_t>(0, midx);
+ const uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos);
+ const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+ uint64_t carry = (v0 >> mpos) & 0x1;
+
+ uint128_t res = (op_mask & vs1) + (op_mask & vs2) + carry;
+
+ carry = (res >> sew) & 0x1u;
+ vd = (vd & ~mmask) | ((carry << mpos) & mmask);
+})
diff --git a/riscv/insns/vmadc_vxm.h b/riscv/insns/vmadc_vxm.h
new file mode 100644
index 0000000..8f26584
--- /dev/null
+++ b/riscv/insns/vmadc_vxm.h
@@ -0,0 +1,14 @@
+// vmadc.vxm vd, vs2, rs1
+require(!(insn.rd() == 0 && P.VU.vlmul > 1));
+VI_XI_LOOP_CARRY
+({
+ auto v0 = P.VU.elt<uint64_t>(0, midx);
+ const uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos);
+ const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+ uint64_t carry = (v0 >> mpos) & 0x1;
+
+ uint128_t res = (op_mask & rs1) + (op_mask & vs2) + carry;
+
+ carry = (res >> sew) & 0x1u;
+ vd = (vd & ~mmask) | ((carry << mpos) & mmask);
+})
diff --git a/riscv/insns/vmadd_vv.h b/riscv/insns/vmadd_vv.h
new file mode 100644
index 0000000..a1c0d2e
--- /dev/null
+++ b/riscv/insns/vmadd_vv.h
@@ -0,0 +1,5 @@
+// vmadd: vd[i] = (vd[i] * vs1[i]) + vs2[i]
+VI_VV_LOOP
+({
+ vd = vd * vs1 + vs2;
+})
diff --git a/riscv/insns/vmadd_vx.h b/riscv/insns/vmadd_vx.h
new file mode 100644
index 0000000..1a8a001
--- /dev/null
+++ b/riscv/insns/vmadd_vx.h
@@ -0,0 +1,5 @@
+// vmadd: vd[i] = (vd[i] * x[rs1]) + vs2[i]
+VI_VX_LOOP
+({
+ vd = vd * rs1 + vs2;
+})
diff --git a/riscv/insns/vmand_mm.h b/riscv/insns/vmand_mm.h
new file mode 100644
index 0000000..04615c6
--- /dev/null
+++ b/riscv/insns/vmand_mm.h
@@ -0,0 +1,2 @@
+// vmand.mm vd, vs2, vs1
+VI_LOOP_MASK(vs2 & vs1);
diff --git a/riscv/insns/vmandnot_mm.h b/riscv/insns/vmandnot_mm.h
new file mode 100644
index 0000000..4c26469
--- /dev/null
+++ b/riscv/insns/vmandnot_mm.h
@@ -0,0 +1,2 @@
+// vmandnot.mm vd, vs2, vs1
+VI_LOOP_MASK(vs2 & ~vs1);
diff --git a/riscv/insns/vmax_vv.h b/riscv/insns/vmax_vv.h
new file mode 100644
index 0000000..b9f15c5
--- /dev/null
+++ b/riscv/insns/vmax_vv.h
@@ -0,0 +1,10 @@
+// vmax.vv vd, vs2, vs1, vm # Vector-vector
+VI_VV_LOOP
+({
+ if (vs1 >= vs2) {
+ vd = vs1;
+ } else {
+ vd = vs2;
+ }
+
+})
diff --git a/riscv/insns/vmax_vx.h b/riscv/insns/vmax_vx.h
new file mode 100644
index 0000000..06f3f43
--- /dev/null
+++ b/riscv/insns/vmax_vx.h
@@ -0,0 +1,10 @@
+// vmax.vx vd, vs2, rs1, vm # vector-scalar
+VI_VX_LOOP
+({
+ if (rs1 >= vs2) {
+ vd = rs1;
+ } else {
+ vd = vs2;
+ }
+
+})
diff --git a/riscv/insns/vmaxu_vv.h b/riscv/insns/vmaxu_vv.h
new file mode 100644
index 0000000..4e6868d
--- /dev/null
+++ b/riscv/insns/vmaxu_vv.h
@@ -0,0 +1,9 @@
+// vmaxu.vv vd, vs2, vs1, vm # Vector-vector
+VI_VV_ULOOP
+({
+ if (vs1 >= vs2) {
+ vd = vs1;
+ } else {
+ vd = vs2;
+ }
+})
diff --git a/riscv/insns/vmaxu_vx.h b/riscv/insns/vmaxu_vx.h
new file mode 100644
index 0000000..cab8918
--- /dev/null
+++ b/riscv/insns/vmaxu_vx.h
@@ -0,0 +1,9 @@
+// vmaxu.vx vd, vs2, rs1, vm # vector-scalar
+VI_VX_ULOOP
+({
+ if (rs1 >= vs2) {
+ vd = rs1;
+ } else {
+ vd = vs2;
+ }
+})
diff --git a/riscv/insns/vmerge_vim.h b/riscv/insns/vmerge_vim.h
new file mode 100644
index 0000000..13354d6
--- /dev/null
+++ b/riscv/insns/vmerge_vim.h
@@ -0,0 +1,9 @@
+// vmerge.vim vd, vs2, simm5
+VI_VVXI_MERGE_LOOP
+({
+ int midx = (P.VU.vmlen * i) / 64;
+ int mpos = (P.VU.vmlen * i) % 64;
+ bool use_first = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+ vd = use_first ? simm5 : vs2;
+})
diff --git a/riscv/insns/vmerge_vvm.h b/riscv/insns/vmerge_vvm.h
new file mode 100644
index 0000000..7530b40
--- /dev/null
+++ b/riscv/insns/vmerge_vvm.h
@@ -0,0 +1,9 @@
+// vmerge.vvm vd, vs2, vs1
+VI_VVXI_MERGE_LOOP
+({
+ int midx = (P.VU.vmlen * i) / 64;
+ int mpos = (P.VU.vmlen * i) % 64;
+ bool use_first = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+ vd = use_first ? vs1 : vs2;
+})
diff --git a/riscv/insns/vmerge_vxm.h b/riscv/insns/vmerge_vxm.h
new file mode 100644
index 0000000..b1757fa
--- /dev/null
+++ b/riscv/insns/vmerge_vxm.h
@@ -0,0 +1,9 @@
+// vmerge.vxm vd, vs2, rs1
+VI_VVXI_MERGE_LOOP
+({
+ int midx = (P.VU.vmlen * i) / 64;
+ int mpos = (P.VU.vmlen * i) % 64;
+ bool use_first = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+
+ vd = use_first ? rs1 : vs2;
+})
diff --git a/riscv/insns/vmfeq_vf.h b/riscv/insns/vmfeq_vf.h
new file mode 100644
index 0000000..cedf4b9
--- /dev/null
+++ b/riscv/insns/vmfeq_vf.h
@@ -0,0 +1,5 @@
+// vfeq.vf vd, vs2, fs1
+VI_VFP_LOOP_CMP
+({
+ res = f32_eq(vs2, rs1);
+})
diff --git a/riscv/insns/vmfeq_vv.h b/riscv/insns/vmfeq_vv.h
new file mode 100644
index 0000000..7e76cac
--- /dev/null
+++ b/riscv/insns/vmfeq_vv.h
@@ -0,0 +1,5 @@
+// vfeq.vv vd, vs2, vs1
+VI_VFP_LOOP_CMP
+({
+ res = f32_eq(vs2, vs1);
+})
diff --git a/riscv/insns/vmfge_vf.h b/riscv/insns/vmfge_vf.h
new file mode 100644
index 0000000..7eade89
--- /dev/null
+++ b/riscv/insns/vmfge_vf.h
@@ -0,0 +1,5 @@
+// vfge.vf vd, vs2, rs1
+VI_VFP_LOOP_CMP
+({
+ res = f32_le_quiet(rs1, vs2);
+})
diff --git a/riscv/insns/vmfgt_vf.h b/riscv/insns/vmfgt_vf.h
new file mode 100644
index 0000000..6115d06
--- /dev/null
+++ b/riscv/insns/vmfgt_vf.h
@@ -0,0 +1,5 @@
+// vfgt.vf vd, vs2, rs1
+VI_VFP_LOOP_CMP
+({
+ res = f32_lt_quiet(rs1, vs2);
+})
diff --git a/riscv/insns/vmfirst_m.h b/riscv/insns/vmfirst_m.h
new file mode 100644
index 0000000..8c216c0
--- /dev/null
+++ b/riscv/insns/vmfirst_m.h
@@ -0,0 +1,20 @@
+// vmfirst rd, vs2
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs2_num = insn.rs2();
+require(P.VU.vstart == 0);
+reg_t pos = -1;
+for (reg_t i=P.VU.vstart; i < vl; ++i) {
+ VI_LOOP_ELEMENT_SKIP()
+
+ bool vs2_lsb = ((P.VU.elt<uint64_t>(rs2_num, midx ) >> mpos) & 0x1) == 1;
+ if (vs2_lsb) {
+ pos = i;
+ break;
+ }
+}
+P.VU.vstart = 0;
+WRITE_RD(pos);
diff --git a/riscv/insns/vmfle_vf.h b/riscv/insns/vmfle_vf.h
new file mode 100644
index 0000000..998b93b
--- /dev/null
+++ b/riscv/insns/vmfle_vf.h
@@ -0,0 +1,5 @@
+// vfle.vf vd, vs2, rs1
+VI_VFP_LOOP_CMP
+({
+ res = f32_le(vs2, rs1);
+})
diff --git a/riscv/insns/vmfle_vv.h b/riscv/insns/vmfle_vv.h
new file mode 100644
index 0000000..c716312
--- /dev/null
+++ b/riscv/insns/vmfle_vv.h
@@ -0,0 +1,5 @@
+// vfle.vv vd, vs2, vs1
+VI_VFP_LOOP_CMP
+({
+ res = f32_le_quiet(vs2, vs1);
+})
diff --git a/riscv/insns/vmflt_vf.h b/riscv/insns/vmflt_vf.h
new file mode 100644
index 0000000..af436e4
--- /dev/null
+++ b/riscv/insns/vmflt_vf.h
@@ -0,0 +1,5 @@
+// vflt.vf vd, vs2, rs1
+VI_VFP_LOOP_CMP
+({
+ res = f32_lt_quiet(vs2, rs1);
+})
diff --git a/riscv/insns/vmflt_vv.h b/riscv/insns/vmflt_vv.h
new file mode 100644
index 0000000..ded867d
--- /dev/null
+++ b/riscv/insns/vmflt_vv.h
@@ -0,0 +1,5 @@
+// vflt.vv vd, vs2, vs1
+VI_VFP_LOOP_CMP
+({
+ res = f32_lt_quiet(vs2, vs1);
+})
diff --git a/riscv/insns/vmfne_vf.h b/riscv/insns/vmfne_vf.h
new file mode 100644
index 0000000..ac2eced
--- /dev/null
+++ b/riscv/insns/vmfne_vf.h
@@ -0,0 +1,5 @@
+// vfne.vf vd, vs2, rs1
+VI_VFP_LOOP_CMP
+({
+ res = !f32_eq(vs2, rs1);
+})
diff --git a/riscv/insns/vmfne_vv.h b/riscv/insns/vmfne_vv.h
new file mode 100644
index 0000000..3fa8beb
--- /dev/null
+++ b/riscv/insns/vmfne_vv.h
@@ -0,0 +1,5 @@
+// vfne.vv vd, vs2, vs1
+VI_VFP_LOOP_CMP
+({
+ res = !f32_eq(vs2, vs1);
+})
diff --git a/riscv/insns/vmford_vf.h b/riscv/insns/vmford_vf.h
new file mode 100644
index 0000000..b5e74f2
--- /dev/null
+++ b/riscv/insns/vmford_vf.h
@@ -0,0 +1,5 @@
+// vford.vf vd, vs2, rs1, vm
+VI_VFP_LOOP_CMP
+({
+ res = !(f32_isSignalingNaN(vs2) || f32_isSignalingNaN(rs1));
+})
diff --git a/riscv/insns/vmford_vv.h b/riscv/insns/vmford_vv.h
new file mode 100644
index 0000000..2e459c1
--- /dev/null
+++ b/riscv/insns/vmford_vv.h
@@ -0,0 +1,5 @@
+// vford.vv vd, vs2, vs1, vm
+VI_VFP_LOOP_CMP
+({
+ res = !(f32_isSignalingNaN(vs2) || f32_isSignalingNaN(vs1));
+})
diff --git a/riscv/insns/vmin_vv.h b/riscv/insns/vmin_vv.h
new file mode 100644
index 0000000..21da0b3
--- /dev/null
+++ b/riscv/insns/vmin_vv.h
@@ -0,0 +1,11 @@
+// vmin.vv vd, vs2, vs1, vm # Vector-vector
+VI_VV_LOOP
+({
+ if (vs1 <= vs2) {
+ vd = vs1;
+ } else {
+ vd = vs2;
+ }
+
+
+})
diff --git a/riscv/insns/vmin_vx.h b/riscv/insns/vmin_vx.h
new file mode 100644
index 0000000..3291776
--- /dev/null
+++ b/riscv/insns/vmin_vx.h
@@ -0,0 +1,11 @@
+// vminx.vx vd, vs2, rs1, vm # vector-scalar
+VI_VX_LOOP
+({
+ if (rs1 <= vs2) {
+ vd = rs1;
+ } else {
+ vd = vs2;
+ }
+
+
+})
diff --git a/riscv/insns/vminu_vv.h b/riscv/insns/vminu_vv.h
new file mode 100644
index 0000000..c0ab195
--- /dev/null
+++ b/riscv/insns/vminu_vv.h
@@ -0,0 +1,9 @@
+// vminu.vv vd, vs2, vs1, vm # Vector-vector
+VI_VV_ULOOP
+({
+ if (vs1 <= vs2) {
+ vd = vs1;
+ } else {
+ vd = vs2;
+ }
+})
diff --git a/riscv/insns/vminu_vx.h b/riscv/insns/vminu_vx.h
new file mode 100644
index 0000000..1055895
--- /dev/null
+++ b/riscv/insns/vminu_vx.h
@@ -0,0 +1,10 @@
+// vminu.vx vd, vs2, rs1, vm # vector-scalar
+VI_VX_ULOOP
+({
+ if (rs1 <= vs2) {
+ vd = rs1;
+ } else {
+ vd = vs2;
+ }
+
+})
diff --git a/riscv/insns/vmnand_mm.h b/riscv/insns/vmnand_mm.h
new file mode 100644
index 0000000..5a3ab09
--- /dev/null
+++ b/riscv/insns/vmnand_mm.h
@@ -0,0 +1,2 @@
+// vmnand.mm vd, vs2, vs1
+VI_LOOP_MASK(~(vs2 & vs1));
diff --git a/riscv/insns/vmnor_mm.h b/riscv/insns/vmnor_mm.h
new file mode 100644
index 0000000..ab93378
--- /dev/null
+++ b/riscv/insns/vmnor_mm.h
@@ -0,0 +1,2 @@
+// vmnor.mm vd, vs2, vs1
+VI_LOOP_MASK(~(vs2 | vs1));
diff --git a/riscv/insns/vmor_mm.h b/riscv/insns/vmor_mm.h
new file mode 100644
index 0000000..32e71b9
--- /dev/null
+++ b/riscv/insns/vmor_mm.h
@@ -0,0 +1,2 @@
+// vmor.mm vd, vs2, vs1
+VI_LOOP_MASK(vs2 | vs1);
diff --git a/riscv/insns/vmornot_mm.h b/riscv/insns/vmornot_mm.h
new file mode 100644
index 0000000..bdc1d8b
--- /dev/null
+++ b/riscv/insns/vmornot_mm.h
@@ -0,0 +1,2 @@
+// vmornot.mm vd, vs2, vs1
+VI_LOOP_MASK(vs2 | ~vs1);
diff --git a/riscv/insns/vmpopc_m.h b/riscv/insns/vmpopc_m.h
new file mode 100644
index 0000000..9e22b2b
--- /dev/null
+++ b/riscv/insns/vmpopc_m.h
@@ -0,0 +1,24 @@
+// vmpopc rd, vs2, vm
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs2_num = insn.rs2();
+require(P.VU.vstart == 0);
+reg_t popcount = 0;
+for (reg_t i=P.VU.vstart; i<vl; ++i) {
+ const int mlen = P.VU.vmlen;
+ const int midx = (mlen * i) / 32;
+ const int mpos = (mlen * i) % 32;
+
+ bool vs2_lsb = ((P.VU.elt<uint32_t>(rs2_num, midx ) >> mpos) & 0x1) == 1;
+ if (insn.v_vm() == 1) {
+ popcount += vs2_lsb;
+ } else {
+ bool do_mask = (P.VU.elt<uint32_t>(0, midx) >> mpos) & 0x1;
+ popcount += (vs2_lsb && do_mask);
+ }
+}
+P.VU.vstart = 0;
+WRITE_RD(popcount);
diff --git a/riscv/insns/vmsbc_vvm.h b/riscv/insns/vmsbc_vvm.h
new file mode 100644
index 0000000..3804ba8
--- /dev/null
+++ b/riscv/insns/vmsbc_vvm.h
@@ -0,0 +1,14 @@
+// vmsbc.vvm vd, vs2, vs1
+require(!(insn.rd() == 0 && P.VU.vlmul > 1));
+VI_VV_LOOP_CARRY
+({
+ auto v0 = P.VU.elt<uint64_t>(0, midx);
+ const uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos);
+ const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+ uint64_t carry = (v0 >> mpos) & 0x1;
+
+ uint128_t res = (op_mask & vs1) - (op_mask & vs2) - carry;
+
+ carry = (res >> sew) & 0x1u;
+ vd = (vd & ~mmask) | ((carry << mpos) & mmask);
+})
diff --git a/riscv/insns/vmsbc_vxm.h b/riscv/insns/vmsbc_vxm.h
new file mode 100644
index 0000000..d5332f5
--- /dev/null
+++ b/riscv/insns/vmsbc_vxm.h
@@ -0,0 +1,14 @@
+// vmsbc.vxm vd, vs2, rs1
+require(!(insn.rd() == 0 && P.VU.vlmul > 1));
+VI_XI_LOOP_CARRY
+({
+ auto &v0 = P.VU.elt<uint64_t>(0, midx);
+ const uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos);
+ const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+ uint64_t carry = (v0 >> mpos) & 0x1;
+
+ uint128_t res = (op_mask & rs1) - (op_mask & vs2) - carry;
+
+ carry = (res >> sew) & 0x1u;
+ vd = (vd & ~mmask) | ((carry << mpos) & mmask);
+})
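
The vmsbc handlers above recover the per-element borrow by doing the SEW-wide subtraction in a wider type and reading bit SEW of the result. A minimal sketch of that idea, assuming sew <= 64 and illustrative names:

    #include <cstdint>

    // Borrow out of (a - b - borrow_in) at an element width of 'sew' bits.
    bool borrow_out(uint64_t a, uint64_t b, bool borrow_in, unsigned sew) {
      unsigned __int128 mask = (~(unsigned __int128)0) >> (128 - sew);
      unsigned __int128 res  = (mask & a) - (mask & b) - borrow_in;
      return (res >> sew) & 1;   // the subtraction wrapped below zero iff this bit is set
    }
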
diff --git a/riscv/insns/vmsbf_m.h b/riscv/insns/vmsbf_m.h
new file mode 100644
index 0000000..a014900
--- /dev/null
+++ b/riscv/insns/vmsbf_m.h
@@ -0,0 +1,34 @@
+// vmsbf.m vd, vs2, vm
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs1_num = insn.rs1();
+reg_t rs2_num = insn.rs2();
+
+bool has_one = false;
+for (reg_t i = P.VU.vstart; i < vl; ++i) {
+ const int mlen = P.VU.vmlen;
+ const int midx = (mlen * i) / 64;
+ const int mpos = (mlen * i) % 64;
+ const uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos);
+
+ bool vs2_lsb = ((P.VU.elt<uint64_t>(rs2_num, midx) >> mpos) & 0x1) == 1;
+ bool do_mask = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+ auto &vd = P.VU.elt<uint64_t>(rd_num, midx);
+
+
+ if (insn.v_vm() == 1 || (insn.v_vm() == 0 && do_mask)) {
+ uint64_t res = 0;
+ if (!has_one && !vs2_lsb) {
+ res = 1;
+ } else if(!has_one && vs2_lsb) {
+ has_one = true;
+ }
+ vd = (vd & ~mmask) | ((res << mpos) & mmask);
+ }
+}
+
+VI_TAIL_ZERO_MASK(rd_num);
+P.VU.vstart = 0;
diff --git a/riscv/insns/vmseq_vi.h b/riscv/insns/vmseq_vi.h
new file mode 100644
index 0000000..cfc1682
--- /dev/null
+++ b/riscv/insns/vmseq_vi.h
@@ -0,0 +1,5 @@
+// vseq.vi vd, vs2, simm5
+VI_VI_LOOP_CMP
+({
+ res = simm5 == vs2;
+})
diff --git a/riscv/insns/vmseq_vv.h b/riscv/insns/vmseq_vv.h
new file mode 100644
index 0000000..91fd204
--- /dev/null
+++ b/riscv/insns/vmseq_vv.h
@@ -0,0 +1,6 @@
+// vseq.vv vd, vs2, vs1
+VI_VV_LOOP_CMP
+({
+ res = vs2 == vs1;
+})
+
diff --git a/riscv/insns/vmseq_vx.h b/riscv/insns/vmseq_vx.h
new file mode 100644
index 0000000..ab63323
--- /dev/null
+++ b/riscv/insns/vmseq_vx.h
@@ -0,0 +1,5 @@
+// vseq.vx vd, vs2, rs1
+VI_VX_LOOP_CMP
+({
+ res = rs1 == vs2;
+})
diff --git a/riscv/insns/vmsgt_vi.h b/riscv/insns/vmsgt_vi.h
new file mode 100644
index 0000000..4f7dea8
--- /dev/null
+++ b/riscv/insns/vmsgt_vi.h
@@ -0,0 +1,5 @@
+// vsgt.vi vd, vs2, simm5
+VI_VI_LOOP_CMP
+({
+ res = vs2 > simm5;
+})
diff --git a/riscv/insns/vmsgt_vx.h b/riscv/insns/vmsgt_vx.h
new file mode 100644
index 0000000..5f24db6
--- /dev/null
+++ b/riscv/insns/vmsgt_vx.h
@@ -0,0 +1,5 @@
+// vsgt.vx vd, vs2, rs1
+VI_VX_LOOP_CMP
+({
+ res = vs2 > rs1;
+})
diff --git a/riscv/insns/vmsgtu_vi.h b/riscv/insns/vmsgtu_vi.h
new file mode 100644
index 0000000..268d437
--- /dev/null
+++ b/riscv/insns/vmsgtu_vi.h
@@ -0,0 +1,5 @@
+// vsgtu.vi vd, vs2, zimm5
+VI_VI_ULOOP_CMP
+({
+ res = vs2 > simm5;
+})
diff --git a/riscv/insns/vmsgtu_vx.h b/riscv/insns/vmsgtu_vx.h
new file mode 100644
index 0000000..7f39800
--- /dev/null
+++ b/riscv/insns/vmsgtu_vx.h
@@ -0,0 +1,5 @@
+// vsgtu.vx vd, vs2, rs1
+VI_VX_ULOOP_CMP
+({
+ res = vs2 > rs1;
+})
diff --git a/riscv/insns/vmsif_m.h b/riscv/insns/vmsif_m.h
new file mode 100644
index 0000000..144b67c
--- /dev/null
+++ b/riscv/insns/vmsif_m.h
@@ -0,0 +1,34 @@
+// vmsif.m vd, vs2, vm
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs1_num = insn.rs1();
+reg_t rs2_num = insn.rs2();
+
+bool has_one = false;
+for (reg_t i = P.VU.vstart ; i < vl; ++i) {
+ const int mlen = P.VU.vmlen;
+ const int midx = (mlen * i) / 64;
+ const int mpos = (mlen * i) % 64;
+ const uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos);
+
+ bool vs2_lsb = ((P.VU.elt<uint64_t>(rs2_num, midx ) >> mpos) & 0x1) == 1;
+ bool do_mask = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+ auto &vd = P.VU.elt<uint64_t>(rd_num, midx);
+
+ if (insn.v_vm() == 1 || (insn.v_vm() == 0 && do_mask)) {
+ uint64_t res = 0;
+ if (!has_one && !vs2_lsb) {
+ res = 1;
+ } else if(!has_one && vs2_lsb) {
+ has_one = true;
+ res = 1;
+ }
+ vd = (vd & ~mmask) | ((res << mpos) & mmask);
+ }
+}
+
+VI_TAIL_ZERO_MASK(rd_num);
+P.VU.vstart = 0;
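
vmsbf.m, vmsif.m and vmsof.m above differ only in how they treat the first set bit: before-first excludes it, including-first keeps it, only-first keeps nothing else. A reference sketch over a plain bit vector, ignoring masking and tail zeroing, with illustrative names:

    #include <cstddef>
    #include <vector>

    // Outputs are assumed to be pre-sized to src.size().
    void first_bit_ops(const std::vector<bool> &src,
                       std::vector<bool> &sbf,   // set-before-first
                       std::vector<bool> &sif,   // set-including-first
                       std::vector<bool> &sof) { // set-only-first
      bool seen = false;
      for (std::size_t i = 0; i < src.size(); ++i) {
        sbf[i] = !seen && !src[i];
        sif[i] = !seen;
        sof[i] = !seen && src[i];
        if (src[i]) seen = true;
      }
    }
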
diff --git a/riscv/insns/vmsle_vi.h b/riscv/insns/vmsle_vi.h
new file mode 100644
index 0000000..f0f67d0
--- /dev/null
+++ b/riscv/insns/vmsle_vi.h
@@ -0,0 +1,5 @@
+// vsle.vi vd, vs2, simm5
+VI_VI_LOOP_CMP
+({
+ res = vs2 <= simm5;
+})
diff --git a/riscv/insns/vmsle_vv.h b/riscv/insns/vmsle_vv.h
new file mode 100644
index 0000000..30aba06
--- /dev/null
+++ b/riscv/insns/vmsle_vv.h
@@ -0,0 +1,5 @@
+// vsle.vv vd, vs2, vs1
+VI_VV_LOOP_CMP
+({
+ res = vs2 <= vs1;
+})
diff --git a/riscv/insns/vmsle_vx.h b/riscv/insns/vmsle_vx.h
new file mode 100644
index 0000000..c26d596
--- /dev/null
+++ b/riscv/insns/vmsle_vx.h
@@ -0,0 +1,5 @@
+// vsle.vx vd, vs2, rs1
+VI_VX_LOOP_CMP
+({
+ res = vs2 <= rs1;
+})
diff --git a/riscv/insns/vmsleu_vi.h b/riscv/insns/vmsleu_vi.h
new file mode 100644
index 0000000..dc4fd18
--- /dev/null
+++ b/riscv/insns/vmsleu_vi.h
@@ -0,0 +1,5 @@
+// vsleu.vi vd, vs2, zimm5
+VI_VI_ULOOP_CMP
+({
+ res = vs2 <= simm5;
+})
diff --git a/riscv/insns/vmsleu_vv.h b/riscv/insns/vmsleu_vv.h
new file mode 100644
index 0000000..0e46032
--- /dev/null
+++ b/riscv/insns/vmsleu_vv.h
@@ -0,0 +1,5 @@
+// vsleu.vv vd, vs2, vs1
+VI_VV_ULOOP_CMP
+({
+ res = vs2 <= vs1;
+})
diff --git a/riscv/insns/vmsleu_vx.h b/riscv/insns/vmsleu_vx.h
new file mode 100644
index 0000000..935b176
--- /dev/null
+++ b/riscv/insns/vmsleu_vx.h
@@ -0,0 +1,5 @@
+// vsleu.vx vd, vs2, rs1
+VI_VX_ULOOP_CMP
+({
+ res = vs2 <= rs1;
+})
diff --git a/riscv/insns/vmslt_vv.h b/riscv/insns/vmslt_vv.h
new file mode 100644
index 0000000..71e6f87
--- /dev/null
+++ b/riscv/insns/vmslt_vv.h
@@ -0,0 +1,5 @@
+// vslt.vv vd, vs2, vs1
+VI_VV_LOOP_CMP
+({
+ res = vs2 < vs1;
+})
diff --git a/riscv/insns/vmslt_vx.h b/riscv/insns/vmslt_vx.h
new file mode 100644
index 0000000..b32bb14
--- /dev/null
+++ b/riscv/insns/vmslt_vx.h
@@ -0,0 +1,5 @@
+// vslt.vx vd, vs2, rs1
+VI_VX_LOOP_CMP
+({
+ res = vs2 < rs1;
+})
diff --git a/riscv/insns/vmsltu_vv.h b/riscv/insns/vmsltu_vv.h
new file mode 100644
index 0000000..53a570a
--- /dev/null
+++ b/riscv/insns/vmsltu_vv.h
@@ -0,0 +1,5 @@
+// vsltu.vv vd, vs2, vs1
+VI_VV_ULOOP_CMP
+({
+ res = vs2 < vs1;
+})
diff --git a/riscv/insns/vmsltu_vx.h b/riscv/insns/vmsltu_vx.h
new file mode 100644
index 0000000..8082544
--- /dev/null
+++ b/riscv/insns/vmsltu_vx.h
@@ -0,0 +1,5 @@
+// vsltu.vx vd, vs2, rs1
+VI_VX_ULOOP_CMP
+({
+ res = vs2 < rs1;
+})
diff --git a/riscv/insns/vmsne_vi.h b/riscv/insns/vmsne_vi.h
new file mode 100644
index 0000000..5e9758e
--- /dev/null
+++ b/riscv/insns/vmsne_vi.h
@@ -0,0 +1,5 @@
+// vsne.vi vd, vs2, simm5
+VI_VI_LOOP_CMP
+({
+ res = vs2 != simm5;
+})
diff --git a/riscv/insns/vmsne_vv.h b/riscv/insns/vmsne_vv.h
new file mode 100644
index 0000000..e6a7174
--- /dev/null
+++ b/riscv/insns/vmsne_vv.h
@@ -0,0 +1,5 @@
+// vsne.vv vd, vs2, vs1
+VI_VV_LOOP_CMP
+({
+ res = vs2 != vs1;
+})
diff --git a/riscv/insns/vmsne_vx.h b/riscv/insns/vmsne_vx.h
new file mode 100644
index 0000000..9e4c155
--- /dev/null
+++ b/riscv/insns/vmsne_vx.h
@@ -0,0 +1,5 @@
+// vsne.vx vd, vs2, rs1
+VI_VX_LOOP_CMP
+({
+ res = vs2 != rs1;
+})
diff --git a/riscv/insns/vmsof_m.h b/riscv/insns/vmsof_m.h
new file mode 100644
index 0000000..b4cbbce
--- /dev/null
+++ b/riscv/insns/vmsof_m.h
@@ -0,0 +1,32 @@
+// vmsof.m vd, vs2, vm
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs1_num = insn.rs1();
+reg_t rs2_num = insn.rs2();
+
+bool has_one = false;
+for (reg_t i = P.VU.vstart ; i < vl; ++i) {
+ const int mlen = P.VU.vmlen;
+ const int midx = (mlen * i) / 64;
+ const int mpos = (mlen * i) % 64;
+ const uint64_t mmask = (UINT64_MAX << (64 - mlen)) >> (64 - mlen - mpos);
+
+ bool vs2_lsb = ((P.VU.elt<uint64_t>(rs2_num, midx ) >> mpos) & 0x1) == 1;
+ bool do_mask = (P.VU.elt<uint64_t>(0, midx) >> mpos) & 0x1;
+ uint64_t &vd = P.VU.elt<uint64_t>(rd_num, midx);
+
+ if (insn.v_vm() == 1 || (insn.v_vm() == 0 && do_mask)) {
+ uint64_t res = 0;
+ if(!has_one && vs2_lsb) {
+ has_one = true;
+ res = 1;
+ }
+ vd = (vd & ~mmask) | ((res << mpos) & mmask);
+ }
+}
+
+VI_TAIL_ZERO_MASK(rd_num);
+P.VU.vstart = 0;
diff --git a/riscv/insns/vmul_vv.h b/riscv/insns/vmul_vv.h
new file mode 100644
index 0000000..a327817
--- /dev/null
+++ b/riscv/insns/vmul_vv.h
@@ -0,0 +1,5 @@
+// vmul vd, vs2, vs1
+VI_VV_LOOP
+({
+ vd = vs2 * vs1;
+})
diff --git a/riscv/insns/vmul_vx.h b/riscv/insns/vmul_vx.h
new file mode 100644
index 0000000..8d68390
--- /dev/null
+++ b/riscv/insns/vmul_vx.h
@@ -0,0 +1,5 @@
+// vmul vd, vs2, rs1
+VI_VX_LOOP
+({
+ vd = vs2 * rs1;
+})
diff --git a/riscv/insns/vmulh_vv.h b/riscv/insns/vmulh_vv.h
new file mode 100644
index 0000000..e861a33
--- /dev/null
+++ b/riscv/insns/vmulh_vv.h
@@ -0,0 +1,5 @@
+// vmulh vd, vs2, vs1
+VI_VV_LOOP
+({
+ vd = ((int128_t)vs2 * vs1) >> sew;
+})
diff --git a/riscv/insns/vmulh_vx.h b/riscv/insns/vmulh_vx.h
new file mode 100644
index 0000000..b6b5503
--- /dev/null
+++ b/riscv/insns/vmulh_vx.h
@@ -0,0 +1,5 @@
+// vmulh vd, vs2, rs1
+VI_VX_LOOP
+({
+ vd = ((int128_t)vs2 * rs1) >> sew;
+})
diff --git a/riscv/insns/vmulhsu_vv.h b/riscv/insns/vmulhsu_vv.h
new file mode 100644
index 0000000..59882da
--- /dev/null
+++ b/riscv/insns/vmulhsu_vv.h
@@ -0,0 +1,37 @@
+// vmulhsu.vv vd, vs2, vs1
+VI_LOOP_BASE
+switch(sew) {
+case e8: {
+ auto &vd = P.VU.elt<int8_t>(rd_num, i);
+ auto vs2 = P.VU.elt<int8_t>(rs2_num, i);
+ auto vs1 = P.VU.elt<uint8_t>(rs1_num, i);
+
+ vd = ((int16_t)vs2 * (uint16_t)vs1) >> sew;
+ break;
+}
+case e16: {
+ auto &vd = P.VU.elt<int16_t>(rd_num, i);
+ auto vs2 = P.VU.elt<int16_t>(rs2_num, i);
+ auto vs1 = P.VU.elt<uint16_t>(rs1_num, i);
+
+ vd = ((int32_t)vs2 * (uint32_t)vs1) >> sew;
+ break;
+}
+case e32: {
+ auto &vd = P.VU.elt<int32_t>(rd_num, i);
+ auto vs2 = P.VU.elt<int32_t>(rs2_num, i);
+ auto vs1 = P.VU.elt<uint32_t>(rs1_num, i);
+
+ vd = ((int64_t)vs2 * (uint64_t)vs1) >> sew;
+ break;
+}
+default: {
+ auto &vd = P.VU.elt<int64_t>(rd_num, i);
+ auto vs2 = P.VU.elt<int64_t>(rs2_num, i);
+ auto vs1 = P.VU.elt<uint64_t>(rs1_num, i);
+
+ vd = ((int128_t)vs2 * (uint128_t)vs1) >> sew;
+ break;
+}
+}
+VI_LOOP_END
diff --git a/riscv/insns/vmulhsu_vx.h b/riscv/insns/vmulhsu_vx.h
new file mode 100644
index 0000000..d39615a
--- /dev/null
+++ b/riscv/insns/vmulhsu_vx.h
@@ -0,0 +1,37 @@
+// vmulhsu.vx vd, vs2, rs1
+VI_LOOP_BASE
+switch(sew) {
+case e8: {
+ auto &vd = P.VU.elt<int8_t>(rd_num, i);
+ auto vs2 = P.VU.elt<int8_t>(rs2_num, i);
+ uint8_t rs1 = RS1;
+
+ vd = ((int16_t)vs2 * (uint16_t)rs1) >> sew;
+ break;
+}
+case e16: {
+ auto &vd = P.VU.elt<int16_t>(rd_num, i);
+ auto vs2 = P.VU.elt<int16_t>(rs2_num, i);
+ uint16_t rs1 = RS1;
+
+ vd = ((int32_t)vs2 * (uint32_t)rs1) >> sew;
+ break;
+}
+case e32: {
+ auto &vd = P.VU.elt<int32_t>(rd_num, i);
+ auto vs2 = P.VU.elt<int32_t>(rs2_num, i);
+ uint32_t rs1 = RS1;
+
+ vd = ((int64_t)vs2 * (uint64_t)rs1) >> sew;
+ break;
+}
+default: {
+ auto &vd = P.VU.elt<int64_t>(rd_num, i);
+ auto vs2 = P.VU.elt<int64_t>(rs2_num, i);
+ uint64_t rs1 = RS1;
+
+ vd = ((int128_t)vs2 * (uint128_t)rs1) >> sew;
+ break;
+}
+}
+VI_LOOP_END
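
vmulhsu returns the high half of a signed-by-unsigned product; each case above widens both operands to the next element width and shifts right by SEW. A one-case sketch (32-bit elements, illustrative name):

    #include <cstdint>

    // High 32 bits of a (signed a) * (unsigned b) product.
    int32_t mulhsu32(int32_t a, uint32_t b) {
      return (int32_t)(((int64_t)a * (uint64_t)b) >> 32);
    }
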
diff --git a/riscv/insns/vmulhu_vv.h b/riscv/insns/vmulhu_vv.h
new file mode 100644
index 0000000..8e318ed
--- /dev/null
+++ b/riscv/insns/vmulhu_vv.h
@@ -0,0 +1,5 @@
+// vmulhu vd, vs2, vs1
+VI_VV_ULOOP
+({
+ vd = ((uint128_t)vs2 * vs1) >> sew;
+})
diff --git a/riscv/insns/vmulhu_vx.h b/riscv/insns/vmulhu_vx.h
new file mode 100644
index 0000000..672ad32
--- /dev/null
+++ b/riscv/insns/vmulhu_vx.h
@@ -0,0 +1,5 @@
+// vmulhu vd, vs2, rs1
+VI_VX_ULOOP
+({
+ vd = ((uint128_t)vs2 * rs1) >> sew;
+})
diff --git a/riscv/insns/vmv_s_x.h b/riscv/insns/vmv_s_x.h
new file mode 100644
index 0000000..1c4ffb2
--- /dev/null
+++ b/riscv/insns/vmv_s_x.h
@@ -0,0 +1,45 @@
+// vmv_s_x: vd[0] = rs1
+require(insn.v_vm() == 1);
+require(P.VU.vsew == e8 || P.VU.vsew == e16 ||
+ P.VU.vsew == e32 || P.VU.vsew == e64);
+reg_t vl = P.VU.vl;
+
+if (vl > 0) {
+ reg_t rd_num = insn.rd();
+ reg_t sew = P.VU.vsew;
+
+ switch(sew) {
+ case e8:
+ P.VU.elt<uint8_t>(rd_num, 0) = RS1;
+ break;
+ case e16:
+ P.VU.elt<uint16_t>(rd_num, 0) = RS1;
+ break;
+ case e32:
+ P.VU.elt<uint32_t>(rd_num, 0) = RS1;
+ break;
+ default:
+ P.VU.elt<uint64_t>(rd_num, 0) = RS1;
+ break;
+ }
+
+ const reg_t max_len = P.VU.VLEN / sew;
+ for (reg_t i = 1; i < max_len; ++i) {
+ switch(sew) {
+ case e8:
+ P.VU.elt<uint8_t>(rd_num, i) = 0;
+ break;
+ case e16:
+ P.VU.elt<uint16_t>(rd_num, i) = 0;
+ break;
+ case e32:
+ P.VU.elt<uint32_t>(rd_num, i) = 0;
+ break;
+ default:
+ P.VU.elt<uint64_t>(rd_num, i) = 0;
+ break;
+ }
+ }
+
+ vl = 0;
+}
diff --git a/riscv/insns/vmv_v_i.h b/riscv/insns/vmv_v_i.h
new file mode 100644
index 0000000..31e9877
--- /dev/null
+++ b/riscv/insns/vmv_v_i.h
@@ -0,0 +1,5 @@
+// vmv.v.i vd, simm5
+VI_VVXI_MERGE_LOOP
+({
+ vd = simm5;
+})
diff --git a/riscv/insns/vmv_v_v.h b/riscv/insns/vmv_v_v.h
new file mode 100644
index 0000000..734010b
--- /dev/null
+++ b/riscv/insns/vmv_v_v.h
@@ -0,0 +1,5 @@
+// vmv.v.v vd, vs1
+VI_VVXI_MERGE_LOOP
+({
+ vd = vs1;
+})
diff --git a/riscv/insns/vmv_v_x.h b/riscv/insns/vmv_v_x.h
new file mode 100644
index 0000000..4688b3f
--- /dev/null
+++ b/riscv/insns/vmv_v_x.h
@@ -0,0 +1,5 @@
+// vmv.v.x vd, rs1
+VI_VVXI_MERGE_LOOP
+({
+ vd = rs1;
+})
diff --git a/riscv/insns/vmxnor_mm.h b/riscv/insns/vmxnor_mm.h
new file mode 100644
index 0000000..0736d5b
--- /dev/null
+++ b/riscv/insns/vmxnor_mm.h
@@ -0,0 +1,2 @@
+// vmxnor.mm vd, vs2, vs1
+VI_LOOP_MASK(~(vs2 ^ vs1));
diff --git a/riscv/insns/vmxor_mm.h b/riscv/insns/vmxor_mm.h
new file mode 100644
index 0000000..7f0c576
--- /dev/null
+++ b/riscv/insns/vmxor_mm.h
@@ -0,0 +1,2 @@
+// vmxor.mm vd, vs2, vs1
+VI_LOOP_MASK(vs2 ^ vs1);
diff --git a/riscv/insns/vnclip_vi.h b/riscv/insns/vnclip_vi.h
new file mode 100644
index 0000000..ca27593
--- /dev/null
+++ b/riscv/insns/vnclip_vi.h
@@ -0,0 +1,24 @@
+// vnclip: vd[i] = clip(round(vs2[i] + rnd) >> simm)
+VRM xrm = P.VU.get_vround_mode();
+int64_t int_max = (1 << (P.VU.vsew - 1)) - 1;
+int64_t int_min = -(1 << (P.VU.vsew - 1));
+VI_VVXI_LOOP_NARROW
+({
+
+ int64_t result = vs2;
+// rounding
+ INT_ROUNDING(result, xrm, sew);
+
+ result = vsext(result, sew * 2) >> (zimm5 & ((sew * 2) < 32? (sew * 2) - 1: 31));
+
+// saturation
+ if (result < int_min) {
+ result = int_min;
+ P.VU.vxsat = 1;
+ } else if (result > int_max) {
+ result = int_max;
+ P.VU.vxsat = 1;
+ }
+
+ vd = result;
+})
diff --git a/riscv/insns/vnclip_vv.h b/riscv/insns/vnclip_vv.h
new file mode 100644
index 0000000..7bcb4cb
--- /dev/null
+++ b/riscv/insns/vnclip_vv.h
@@ -0,0 +1,30 @@
+// vnclip: vd[i] = clip(round(vs2[i] + rnd) >> vs1[i])
+VRM xrm = P.VU.get_vround_mode();
+int64_t int_max = (1 << (P.VU.vsew - 1)) - 1;
+int64_t int_min = -(1 << (P.VU.vsew - 1));
+VI_VVXI_LOOP_NARROW
+({
+
+ int64_t result = vs2;
+// rounding
+ INT_ROUNDING(result, xrm, sew);
+
+// unsigned shifting to rs1
+ uint64_t unsigned_shift_amount = (uint64_t)(vs1 & ((sew * 2) - 1));
+ if (unsigned_shift_amount >= (2 * sew)) {
+ unsigned_shift_amount = 2 * sew - 1;
+ }
+
+ result = (vsext(result, sew * 2)) >> unsigned_shift_amount;
+
+// saturation
+ if (result < int_min) {
+ result = int_min;
+ P.VU.vxsat = 1;
+ } else if (result > int_max) {
+ result = int_max;
+ P.VU.vxsat = 1;
+ }
+
+ vd = result;
+})
diff --git a/riscv/insns/vnclip_vx.h b/riscv/insns/vnclip_vx.h
new file mode 100644
index 0000000..b66e830
--- /dev/null
+++ b/riscv/insns/vnclip_vx.h
@@ -0,0 +1,29 @@
+// vnclip: vd[i] = clip(round(vs2[i] + rnd) >> rs1[i])
+VRM xrm = P.VU.get_vround_mode();
+int64_t int_max = (1 << (P.VU.vsew - 1)) - 1;
+int64_t int_min = -(1 << (P.VU.vsew - 1));
+VI_VVXI_LOOP_NARROW
+({
+
+ int64_t result = vs2;
+// rounding
+ INT_ROUNDING(result, xrm, sew);
+
+// unsigned shifting to rs1
+ uint64_t unsigned_shift_amount = (uint64_t)(rs1 & ((sew * 2) - 1));
+ if (unsigned_shift_amount >= (2 * sew)) {
+ unsigned_shift_amount = 2 * sew - 1;
+ }
+ result = vsext(result, sew * 2) >> unsigned_shift_amount;
+
+// saturation
+ if (result < int_min) {
+ result = int_min;
+ P.VU.vxsat = 1;
+ } else if (result > int_max) {
+ result = int_max;
+ P.VU.vxsat = 1;
+ }
+
+ vd = result;
+})
diff --git a/riscv/insns/vnclipu_vi.h b/riscv/insns/vnclipu_vi.h
new file mode 100644
index 0000000..61cb015
--- /dev/null
+++ b/riscv/insns/vnclipu_vi.h
@@ -0,0 +1,20 @@
+// vnclipu: vd[i] = clip(round(vs2[i] + rnd) >> simm)
+VRM xrm = P.VU.get_vround_mode();
+uint64_t int_max = ~(-1ll << P.VU.vsew);
+VI_VVXI_LOOP_NARROW
+({
+ uint64_t result = vs2_u;
+ // rounding
+ INT_ROUNDING(result, xrm, sew);
+
+ // unsigned shifting to rs1
+ result = vzext(result, sew * 2) >> (zimm5 & ((sew * 2) < 32? (sew * 2) - 1: 31));
+
+ // saturation
+ if (result & (uint64_t)(-1ll << sew)) {
+ result = int_max;
+ P.VU.vxsat = 1;
+ }
+
+ vd = result;
+})
diff --git a/riscv/insns/vnclipu_vv.h b/riscv/insns/vnclipu_vv.h
new file mode 100644
index 0000000..004f24f
--- /dev/null
+++ b/riscv/insns/vnclipu_vv.h
@@ -0,0 +1,26 @@
+// vnclipu: vd[i] = clip(round(vs2[i] + rnd) >> vs1[i])
+VRM xrm = P.VU.get_vround_mode();
+uint64_t int_max = ~(-1ll << P.VU.vsew);
+VI_VVXI_LOOP_NARROW
+({
+
+ uint64_t result = vs2_u;
+
+// rounding
+ INT_ROUNDING(result, xrm, sew);
+
+// unsigned shifting to rs1
+ uint64_t unsigned_shift_amount = (uint64_t)(vs1 & ((sew * 2) - 1));
+ if (unsigned_shift_amount >= (2 * sew)) {
+ result = 0;
+ } else {
+ result = vzext(result, sew * 2) >> unsigned_shift_amount;
+ }
+// saturation
+ if (result & (uint64_t)(-1ll << sew)) {
+ result = int_max;
+ P.VU.vxsat = 1;
+ }
+
+ vd = result;
+})
diff --git a/riscv/insns/vnclipu_vx.h b/riscv/insns/vnclipu_vx.h
new file mode 100644
index 0000000..0507a2b
--- /dev/null
+++ b/riscv/insns/vnclipu_vx.h
@@ -0,0 +1,26 @@
+// vnclipu: vd[i] = clip(round(vs2[i] + rnd) >> rs1[i])
+VRM xrm = P.VU.get_vround_mode();
+uint64_t int_max = ~(-1ll << P.VU.vsew);
+VI_VVXI_LOOP_NARROW
+({
+ uint64_t result = vs2;
+
+// rounding
+ INT_ROUNDING(result, xrm, sew);
+
+// unsigned shifting to rs1
+ uint64_t unsigned_shift_amount = (uint64_t)(rs1 & ((sew * 2) - 1));
+ if (unsigned_shift_amount >= (2 * sew)) {
+ result = 0;
+ } else {
+ result = vzext(result, sew * 2) >> unsigned_shift_amount;
+ }
+
+// saturation
+ if (result & (uint64_t)(-1ll << sew)) {
+ result = int_max;
+ P.VU.vxsat = 1;
+ }
+
+ vd = result;
+})
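
The vnclip/vnclipu handlers narrow a 2*SEW source to SEW bits: round according to vxrm, shift right, then saturate. A sketch of the unsigned path, assuming the round-to-nearest-up mode only (the INT_ROUNDING macro in decode.h covers the full vxrm set) and a destination width of at most 32 bits:

    #include <cstdint>

    uint64_t vnclipu_ref(uint64_t wide_src, unsigned shift, unsigned sew,
                         bool &vxsat) {
      shift &= 2 * sew - 1;                        // shift uses log2(2*SEW) bits
      unsigned __int128 v = wide_src;
      if (shift)
        v += (unsigned __int128)1 << (shift - 1);  // rnu: add half an LSB
      uint64_t res = (uint64_t)(v >> shift);
      uint64_t max = ((uint64_t)1 << sew) - 1;
      if (res > max) { res = max; vxsat = true; }  // clip to SEW bits
      return res;
    }
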
diff --git a/riscv/insns/vnmsac_vv.h b/riscv/insns/vnmsac_vv.h
new file mode 100644
index 0000000..7c10f29
--- /dev/null
+++ b/riscv/insns/vnmsac_vv.h
@@ -0,0 +1,5 @@
+// vnmsac.vv: vd[i] = -(vs1[i] * vs2[i]) + vd[i]
+VI_VV_LOOP
+({
+ vd = -(vs1 * vs2) + vd;
+})
diff --git a/riscv/insns/vnmsac_vx.h b/riscv/insns/vnmsac_vx.h
new file mode 100644
index 0000000..44920be
--- /dev/null
+++ b/riscv/insns/vnmsac_vx.h
@@ -0,0 +1,5 @@
+// vnmsac.vx: vd[i] = -(x[rs1] * vs2[i]) + vd[i]
+VI_VX_LOOP
+({
+ vd = -(rs1 * vs2) + vd;
+})
diff --git a/riscv/insns/vnmsub_vv.h b/riscv/insns/vnmsub_vv.h
new file mode 100644
index 0000000..37f8228
--- /dev/null
+++ b/riscv/insns/vnmsub_vv.h
@@ -0,0 +1,5 @@
+// vnmsub.vv: vd[i] = -(vd[i] * vs1[i]) + vs2[i]
+VI_VV_LOOP
+({
+ vd = -(vd * vs1) + vs2;
+})
diff --git a/riscv/insns/vnmsub_vx.h b/riscv/insns/vnmsub_vx.h
new file mode 100644
index 0000000..2e00d22
--- /dev/null
+++ b/riscv/insns/vnmsub_vx.h
@@ -0,0 +1,5 @@
+// vnmsub.vx: vd[i] = -(vd[i] * x[rs1]) + vs2[i]
+VI_VX_LOOP
+({
+ vd = -(vd * rs1) + vs2;
+})
diff --git a/riscv/insns/vnsra_vi.h b/riscv/insns/vnsra_vi.h
new file mode 100644
index 0000000..0502ff1
--- /dev/null
+++ b/riscv/insns/vnsra_vi.h
@@ -0,0 +1,5 @@
+// vnsra.vi vd, vs2, zimm5
+VI_VI_LOOP_NSHIFT
+({
+ vd = vs2 >> (zimm5 & (sew * 2 - 1) & 0x1f);
+})
diff --git a/riscv/insns/vnsra_vv.h b/riscv/insns/vnsra_vv.h
new file mode 100644
index 0000000..555ce3f
--- /dev/null
+++ b/riscv/insns/vnsra_vv.h
@@ -0,0 +1,5 @@
+// vnsra.vv vd, vs2, vs1
+VI_VV_LOOP_NSHIFT
+({
+ vd = vs2 >> (vs1 & (sew * 2 - 1));
+})
diff --git a/riscv/insns/vnsra_vx.h b/riscv/insns/vnsra_vx.h
new file mode 100644
index 0000000..05a55e3
--- /dev/null
+++ b/riscv/insns/vnsra_vx.h
@@ -0,0 +1,5 @@
+// vnsra.vx vd, vs2, rs1
+VI_VX_LOOP_NSHIFT
+({
+ vd = vs2 >> (rs1 & (sew * 2 - 1));
+})
diff --git a/riscv/insns/vnsrl_vi.h b/riscv/insns/vnsrl_vi.h
new file mode 100644
index 0000000..d4dfcf0
--- /dev/null
+++ b/riscv/insns/vnsrl_vi.h
@@ -0,0 +1,5 @@
+// vnsrl.vi vd, vs2, zimm5
+VI_VI_LOOP_NSHIFT
+({
+ vd = vs2_u >> (zimm5 & (sew * 2 - 1));
+})
diff --git a/riscv/insns/vnsrl_vv.h b/riscv/insns/vnsrl_vv.h
new file mode 100644
index 0000000..ab72b84
--- /dev/null
+++ b/riscv/insns/vnsrl_vv.h
@@ -0,0 +1,5 @@
+// vnsrl.vv vd, vs2, vs1
+VI_VV_LOOP_NSHIFT
+({
+ vd = vs2_u >> (vs1 & (sew * 2 - 1));
+})
diff --git a/riscv/insns/vnsrl_vx.h b/riscv/insns/vnsrl_vx.h
new file mode 100644
index 0000000..e149b38
--- /dev/null
+++ b/riscv/insns/vnsrl_vx.h
@@ -0,0 +1,5 @@
+// vnsrl.vx vd, vs2, rs1
+VI_VX_LOOP_NSHIFT
+({
+ vd = vs2_u >> (rs1 & (sew * 2 - 1));
+})
diff --git a/riscv/insns/vor_vi.h b/riscv/insns/vor_vi.h
new file mode 100644
index 0000000..f759607
--- /dev/null
+++ b/riscv/insns/vor_vi.h
@@ -0,0 +1,5 @@
+// vor
+VI_VI_LOOP
+({
+ vd = simm5 | vs2;
+})
diff --git a/riscv/insns/vor_vv.h b/riscv/insns/vor_vv.h
new file mode 100644
index 0000000..0c46066
--- /dev/null
+++ b/riscv/insns/vor_vv.h
@@ -0,0 +1,5 @@
+// vor
+VI_VV_LOOP
+({
+ vd = vs1 | vs2;
+})
diff --git a/riscv/insns/vor_vx.h b/riscv/insns/vor_vx.h
new file mode 100644
index 0000000..01c003a
--- /dev/null
+++ b/riscv/insns/vor_vx.h
@@ -0,0 +1,5 @@
+// vor
+VI_VX_LOOP
+({
+ vd = rs1 | vs2;
+})
diff --git a/riscv/insns/vredand_vs.h b/riscv/insns/vredand_vs.h
new file mode 100644
index 0000000..6c2d908
--- /dev/null
+++ b/riscv/insns/vredand_vs.h
@@ -0,0 +1,5 @@
+// vredand.vs vd, vs2, vs1
+VI_VV_LOOP_REDUCTION
+({
+ vd_0_res &= vs2;
+})
diff --git a/riscv/insns/vredmax_vs.h b/riscv/insns/vredmax_vs.h
new file mode 100644
index 0000000..be2e76a
--- /dev/null
+++ b/riscv/insns/vredmax_vs.h
@@ -0,0 +1,5 @@
+// vredmax.vs vd, vs2, vs1
+VI_VV_LOOP_REDUCTION
+({
+ vd_0_res = (vd_0_res >= vs2) ? vd_0_res : vs2;
+})
diff --git a/riscv/insns/vredmaxu_vs.h b/riscv/insns/vredmaxu_vs.h
new file mode 100644
index 0000000..960f486
--- /dev/null
+++ b/riscv/insns/vredmaxu_vs.h
@@ -0,0 +1,5 @@
+// vredmaxu.vs vd, vs2, vs1
+VI_VV_ULOOP_REDUCTION
+({
+ vd_0_res = (vd_0_res >= vs2) ? vd_0_res : vs2;
+})
diff --git a/riscv/insns/vredmin_vs.h b/riscv/insns/vredmin_vs.h
new file mode 100644
index 0000000..50359b7
--- /dev/null
+++ b/riscv/insns/vredmin_vs.h
@@ -0,0 +1,5 @@
+// vredmin.vs vd, vs2, vs1
+VI_VV_LOOP_REDUCTION
+({
+ vd_0_res = (vd_0_res <= vs2) ? vd_0_res : vs2;
+})
diff --git a/riscv/insns/vredminu_vs.h b/riscv/insns/vredminu_vs.h
new file mode 100644
index 0000000..7082475
--- /dev/null
+++ b/riscv/insns/vredminu_vs.h
@@ -0,0 +1,5 @@
+// vredminu.vs vd, vs2, vs1
+VI_VV_ULOOP_REDUCTION
+({
+ vd_0_res = (vd_0_res <= vs2) ? vd_0_res : vs2;
+})
diff --git a/riscv/insns/vredor_vs.h b/riscv/insns/vredor_vs.h
new file mode 100644
index 0000000..f7acd9a
--- /dev/null
+++ b/riscv/insns/vredor_vs.h
@@ -0,0 +1,5 @@
+// vredor.vs vd, vs2, vs1
+VI_VV_LOOP_REDUCTION
+({
+ vd_0_res |= vs2;
+})
diff --git a/riscv/insns/vredsum_vs.h b/riscv/insns/vredsum_vs.h
new file mode 100644
index 0000000..c4fefe5
--- /dev/null
+++ b/riscv/insns/vredsum_vs.h
@@ -0,0 +1,5 @@
+// vredsum.vs vd, vs2, vs1
+VI_VV_LOOP_REDUCTION
+({
+ vd_0_res += vs2;
+})
diff --git a/riscv/insns/vredxor_vs.h b/riscv/insns/vredxor_vs.h
new file mode 100644
index 0000000..bb81ad9
--- /dev/null
+++ b/riscv/insns/vredxor_vs.h
@@ -0,0 +1,5 @@
+// vredxor.vs vd, vs2, vs1
+VI_VV_LOOP_REDUCTION
+({
+ vd_0_res ^= vs2;
+})
diff --git a/riscv/insns/vrem_vv.h b/riscv/insns/vrem_vv.h
new file mode 100644
index 0000000..da477f0
--- /dev/null
+++ b/riscv/insns/vrem_vv.h
@@ -0,0 +1,11 @@
+// vrem.vv vd, vs2, vs1
+VI_VV_LOOP
+({
+ if (vs1 == 0)
+ vd = vs2;
+ else if(vs2 == -(1 << (sew - 1)) && vs1 == -1)
+ vd = 0;
+ else {
+ vd = vs2 % vs1;
+ }
+})
diff --git a/riscv/insns/vrem_vx.h b/riscv/insns/vrem_vx.h
new file mode 100644
index 0000000..f068842
--- /dev/null
+++ b/riscv/insns/vrem_vx.h
@@ -0,0 +1,10 @@
+// vrem.vx vd, vs2, rs1
+VI_VX_LOOP
+({
+ if (rs1 == 0)
+ vd = vs2;
+ else if (vs2 == -(1 << (sew - 1)) && rs1 == -1)
+ vd = 0;
+ else
+ vd = vs2 % rs1;
+})
diff --git a/riscv/insns/vremu_vv.h b/riscv/insns/vremu_vv.h
new file mode 100644
index 0000000..7e15072
--- /dev/null
+++ b/riscv/insns/vremu_vv.h
@@ -0,0 +1,8 @@
+// vremu.vv vd, vs2, vs1
+VI_VV_ULOOP
+({
+ if (vs1 == 0)
+ vd = vs2;
+ else
+ vd = vs2 % vs1;
+})
diff --git a/riscv/insns/vremu_vx.h b/riscv/insns/vremu_vx.h
new file mode 100644
index 0000000..a87a820
--- /dev/null
+++ b/riscv/insns/vremu_vx.h
@@ -0,0 +1,8 @@
+// vremu.vx vd, vs2, rs1
+VI_VX_ULOOP
+({
+ if (rs1 == 0)
+ vd = vs2;
+ else
+ vd = vs2 % rs1;
+})
diff --git a/riscv/insns/vrgather_vi.h b/riscv/insns/vrgather_vi.h
new file mode 100644
index 0000000..a9be102
--- /dev/null
+++ b/riscv/insns/vrgather_vi.h
@@ -0,0 +1,29 @@
+// vrgather.vi vd, vs2, zimm5, vm # vd[i] = (zimm5 >= VLMAX) ? 0 : vs2[zimm5];
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs2_num = insn.rs2();
+reg_t zimm5 = insn.v_zimm5();
+for (reg_t i = P.VU.vstart; i < vl; ++i) {
+ VI_LOOP_ELEMENT_SKIP();
+
+ switch (sew) {
+ case e8:
+ P.VU.elt<uint8_t>(rd_num, i) = zimm5 >= P.VU.vlmax ? 0 : P.VU.elt<uint8_t>(rs2_num, zimm5);
+ break;
+ case e16:
+ P.VU.elt<uint16_t>(rd_num, i) = zimm5 >= P.VU.vlmax ? 0 : P.VU.elt<uint16_t>(rs2_num, zimm5);
+ break;
+ case e32:
+ P.VU.elt<uint32_t>(rd_num, i) = zimm5 >= P.VU.vlmax ? 0 : P.VU.elt<uint32_t>(rs2_num, zimm5);
+ break;
+ default:
+ P.VU.elt<uint64_t>(rd_num, i) = zimm5 >= P.VU.vlmax ? 0 : P.VU.elt<uint64_t>(rs2_num, zimm5);
+ break;
+ }
+}
+
+VI_TAIL_ZERO(1);
+P.VU.vstart = 0;
diff --git a/riscv/insns/vrgather_vv.h b/riscv/insns/vrgather_vv.h
new file mode 100644
index 0000000..da8dc81
--- /dev/null
+++ b/riscv/insns/vrgather_vv.h
@@ -0,0 +1,39 @@
+// vrgather.vv vd, vs2, vs1, vm # vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs1_num = insn.rs1();
+reg_t rs2_num = insn.rs2();
+for (reg_t i = P.VU.vstart; i < vl; ++i) {
+ VI_LOOP_ELEMENT_SKIP();
+ VI_CHECK_VREG_OVERLAP(rd_num, rs1_num);
+ VI_CHECK_VREG_OVERLAP(rd_num, rs2_num);
+ switch (sew) {
+ case e8: {
+ auto vs1 = P.VU.elt<uint8_t>(rs1_num, i);
+ //if (i > 255) continue;
+ P.VU.elt<uint8_t>(rd_num, i) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint8_t>(rs2_num, vs1);
+ break;
+ }
+ case e16: {
+ auto vs1 = P.VU.elt<uint16_t>(rs1_num, i);
+ P.VU.elt<uint16_t>(rd_num, i) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint16_t>(rs2_num, vs1);
+ break;
+ }
+ case e32: {
+ auto vs1 = P.VU.elt<uint32_t>(rs1_num, i);
+ P.VU.elt<uint32_t>(rd_num, i) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint32_t>(rs2_num, vs1);
+ break;
+ }
+ default: {
+ auto vs1 = P.VU.elt<uint64_t>(rs1_num, i);
+ P.VU.elt<uint64_t>(rd_num, i) = vs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint64_t>(rs2_num, vs1);
+ break;
+ }
+ }
+}
+
+VI_TAIL_ZERO(1);
+P.VU.vstart = 0;
diff --git a/riscv/insns/vrgather_vx.h b/riscv/insns/vrgather_vx.h
new file mode 100644
index 0000000..d6c2e38
--- /dev/null
+++ b/riscv/insns/vrgather_vx.h
@@ -0,0 +1,30 @@
+// vrgather.vx vd, vs2, rs1, vm # vd[i] = (rs1 >= VLMAX) ? 0 : vs2[rs1];
+require(P.VU.vsew >= e8 && P.VU.vsew <= e64);
+require(!P.VU.vill);
+reg_t vl = P.VU.vl;
+reg_t sew = P.VU.vsew;
+reg_t rd_num = insn.rd();
+reg_t rs1_num = insn.rs1();
+reg_t rs2_num = insn.rs2();
+reg_t rs1 = RS1;
+for (reg_t i = P.VU.vstart; i < vl; ++i) {
+ VI_LOOP_ELEMENT_SKIP();
+
+ switch (sew) {
+ case e8:
+ P.VU.elt<uint8_t>(rd_num, i) = rs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint8_t>(rs2_num, rs1);
+ break;
+ case e16:
+ P.VU.elt<uint16_t>(rd_num, i) = rs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint16_t>(rs2_num, rs1);
+ break;
+ case e32:
+ P.VU.elt<uint32_t>(rd_num, i) = rs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint32_t>(rs2_num, rs1);
+ break;
+ default:
+ P.VU.elt<uint64_t>(rd_num, i) = rs1 >= P.VU.vlmax ? 0 : P.VU.elt<uint64_t>(rs2_num, rs1);
+ break;
+ }
+}
+
+VI_TAIL_ZERO(1);
+P.VU.vstart = 0;
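
vrgather builds each destination element by indexing into vs2, with out-of-range indices reading as zero. A reference sketch for 32-bit elements (illustrative names; vs2 is assumed to hold vlmax elements):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> vrgather_ref(const std::vector<uint32_t> &vs2,
                                       const std::vector<uint32_t> &index,
                                       std::size_t vlmax) {
      std::vector<uint32_t> vd(index.size());
      for (std::size_t i = 0; i < index.size(); ++i)
        vd[i] = (index[i] >= vlmax) ? 0 : vs2[index[i]];
      return vd;
    }
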
diff --git a/riscv/insns/vrsub_vi.h b/riscv/insns/vrsub_vi.h
new file mode 100644
index 0000000..198c33f
--- /dev/null
+++ b/riscv/insns/vrsub_vi.h
@@ -0,0 +1,5 @@
+// vrsub.vi vd, vs2, imm, vm # vd[i] = imm - vs2[i]
+VI_VI_LOOP
+({
+ vd = simm5 - vs2;
+})
diff --git a/riscv/insns/vrsub_vx.h b/riscv/insns/vrsub_vx.h
new file mode 100644
index 0000000..bfd6259
--- /dev/null
+++ b/riscv/insns/vrsub_vx.h
@@ -0,0 +1,5 @@
+// vrsub.vx vd, vs2, rs1, vm # vd[i] = rs1 - vs2[i]
+VI_VX_LOOP
+({
+ vd = rs1 - vs2;
+})
diff --git a/riscv/insns/vsadd_vi.h b/riscv/insns/vsadd_vi.h
new file mode 100644
index 0000000..de2cb83
--- /dev/null
+++ b/riscv/insns/vsadd_vi.h
@@ -0,0 +1,27 @@
+// vsadd.vi vd, vs2, simm5
+VI_LOOP_BASE
+bool sat = false;
+switch(sew) {
+case e8: {
+ VI_PARAMS(e8);
+ vd = sat_add<int8_t, uint8_t>(vs2, vsext(simm5, sew), sat);
+ break;
+}
+case e16: {
+ VI_PARAMS(e16);
+ vd = sat_add<int16_t, uint16_t>(vs2, vsext(simm5, sew), sat);
+ break;
+}
+case e32: {
+ VI_PARAMS(e32);
+ vd = sat_add<int32_t, uint32_t>(vs2, vsext(simm5, sew), sat);
+ break;
+}
+default: {
+ VI_PARAMS(e64);
+ vd = sat_add<int64_t, uint64_t>(vs2, vsext(simm5, sew), sat);
+ break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
diff --git a/riscv/insns/vsadd_vv.h b/riscv/insns/vsadd_vv.h
new file mode 100644
index 0000000..2152bab
--- /dev/null
+++ b/riscv/insns/vsadd_vv.h
@@ -0,0 +1,28 @@
+// vsadd.vv vd, vs2, vs1
+VI_LOOP_BASE
+bool sat = false;
+switch(sew) {
+case e8: {
+ VV_PARAMS(e8);
+ vd = sat_add<int8_t, uint8_t>(vs2, vs1, sat);
+ break;
+}
+case e16: {
+ VV_PARAMS(e16);
+ vd = sat_add<int16_t, uint16_t>(vs2, vs1, sat);
+ break;
+}
+case e32: {
+ VV_PARAMS(e32);
+ vd = sat_add<int32_t, uint32_t>(vs2, vs1, sat);
+ break;
+}
+default: {
+ VV_PARAMS(e64);
+ vd = sat_add<int64_t, uint64_t>(vs2, vs1, sat);
+ break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
+
diff --git a/riscv/insns/vsadd_vx.h b/riscv/insns/vsadd_vx.h
new file mode 100644
index 0000000..781e9e8
--- /dev/null
+++ b/riscv/insns/vsadd_vx.h
@@ -0,0 +1,27 @@
+// vsadd.vx vd, vs2, rs1
+VI_LOOP_BASE
+bool sat = false;
+switch(sew) {
+case e8: {
+ VX_PARAMS(e8);
+ vd = sat_add<int8_t, uint8_t>(vs2, rs1, sat);
+ break;
+}
+case e16: {
+ VX_PARAMS(e16);
+ vd = sat_add<int16_t, uint16_t>(vs2, rs1, sat);
+ break;
+}
+case e32: {
+ VX_PARAMS(e32);
+ vd = sat_add<int32_t, uint32_t>(vs2, rs1, sat);
+ break;
+}
+default: {
+ VX_PARAMS(e64);
+ vd = sat_add<int64_t, uint64_t>(vs2, rs1, sat);
+ break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
diff --git a/riscv/insns/vsaddu_vi.h b/riscv/insns/vsaddu_vi.h
new file mode 100644
index 0000000..9d376cc
--- /dev/null
+++ b/riscv/insns/vsaddu_vi.h
@@ -0,0 +1,11 @@
+// vsaddu vd, vs2, zimm5
+VI_VI_ULOOP
+({
+ bool sat = false;
+ vd = vs2 + simm5;
+
+ sat = vd < vs2;
+ vd |= -(vd < vs2);
+
+ P.VU.vxsat |= sat;
+})
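
The vsaddu variants use a branch-free saturation: after the wrapping add, (vd < vs2) detects overflow, and OR-ing in its negation forces the result to all ones. A standalone sketch of that trick (illustrative name):

    #include <cstdint>

    uint32_t sat_addu32(uint32_t a, uint32_t b, bool &vxsat) {
      uint32_t sum = a + b;           // may wrap
      bool sat = sum < a;             // wrapped iff the sum lost a carry
      sum |= -(uint32_t)sat;          // 0xFFFFFFFF when saturated, unchanged otherwise
      vxsat |= sat;
      return sum;
    }
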
diff --git a/riscv/insns/vsaddu_vv.h b/riscv/insns/vsaddu_vv.h
new file mode 100644
index 0000000..e5d7025
--- /dev/null
+++ b/riscv/insns/vsaddu_vv.h
@@ -0,0 +1,11 @@
+// vsaddu vd, vs2, vs1
+VI_VV_ULOOP
+({
+ bool sat = false;
+ vd = vs2 + vs1;
+
+ sat = vd < vs2;
+ vd |= -(vd < vs2);
+
+ P.VU.vxsat |= sat;
+})
diff --git a/riscv/insns/vsaddu_vx.h b/riscv/insns/vsaddu_vx.h
new file mode 100644
index 0000000..46ec29d
--- /dev/null
+++ b/riscv/insns/vsaddu_vx.h
@@ -0,0 +1,12 @@
+// vsaddu vd, vs2, rs1
+VI_VX_ULOOP
+({
+ bool sat = false;
+ vd = vs2 + rs1;
+
+ sat = vd < vs2;
+ vd |= -(vd < vs2);
+
+ P.VU.vxsat |= sat;
+
+})
diff --git a/riscv/insns/vsbc_vvm.h b/riscv/insns/vsbc_vvm.h
new file mode 100644
index 0000000..4cd58ba
--- /dev/null
+++ b/riscv/insns/vsbc_vvm.h
@@ -0,0 +1,11 @@
+// vsbc.vvm vd, vs2, vs1
+require(!(insn.rd() == 0 && P.VU.vlmul > 1));
+VI_VV_LOOP
+({
+ auto &v0 = P.VU.elt<uint64_t>(0, midx);
+ const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+ uint64_t carry = (v0 >> mpos) & 0x1;
+
+ uint128_t res = (op_mask & vs1) - (op_mask & vs2) - carry;
+ vd = res;
+})
diff --git a/riscv/insns/vsbc_vxm.h b/riscv/insns/vsbc_vxm.h
new file mode 100644
index 0000000..12551b8
--- /dev/null
+++ b/riscv/insns/vsbc_vxm.h
@@ -0,0 +1,11 @@
+// vsbc.vxm vd, vs2, rs1
+require(!(insn.rd() == 0 && P.VU.vlmul > 1));
+VI_VX_ULOOP
+({
+ auto &v0 = P.VU.elt<uint64_t>(0, midx);
+ const uint128_t op_mask = (UINT64_MAX >> (64 - sew));
+ uint64_t carry = (v0 >> mpos) & 0x1;
+
+ uint128_t res = (op_mask & rs1) - (op_mask & vs2) - carry;
+ vd = res;
+})
diff --git a/riscv/insns/vslide1down_vx.h b/riscv/insns/vslide1down_vx.h
new file mode 100644
index 0000000..0069df7
--- /dev/null
+++ b/riscv/insns/vslide1down_vx.h
@@ -0,0 +1,42 @@
+//vslide1down.vx vd, vs2, rs1
+VI_LOOP_BASE
+if (i != vl - 1) {
+ switch (sew) {
+ case e8: {
+ VI_XI_SLIDEDOWN_PARAMS(e8, 1);
+ vd = vs2;
+ }
+ break;
+ case e16: {
+ VI_XI_SLIDEDOWN_PARAMS(e16, 1);
+ vd = vs2;
+ }
+ break;
+ case e32: {
+ VI_XI_SLIDEDOWN_PARAMS(e32, 1);
+ vd = vs2;
+ }
+ break;
+ default: {
+ VI_XI_SLIDEDOWN_PARAMS(e64, 1);
+ vd = vs2;
+ }
+ break;
+ }
+} else {
+ switch (sew) {
+ case e8:
+ P.VU.elt<uint8_t>(rd_num, vl - 1) = RS1;
+ break;
+ case e16:
+ P.VU.elt<uint16_t>(rd_num, vl - 1) = RS1;
+ break;
+ case e32:
+ P.VU.elt<uint32_t>(rd_num, vl - 1) = RS1;
+ break;
+ default:
+ P.VU.elt<uint64_t>(rd_num, vl - 1) = RS1;
+ break;
+ }
+}
+VI_LOOP_END
diff --git a/riscv/insns/vslide1up_vx.h b/riscv/insns/vslide1up_vx.h
new file mode 100644
index 0000000..50cc503
--- /dev/null
+++ b/riscv/insns/vslide1up_vx.h
@@ -0,0 +1,32 @@
+//vslide1up.vx vd, vs2, rs1
+if (insn.v_vm() == 0)
+ require(insn.rd() != 0);
+
+VI_CHECK_SS
+VI_LOOP_BASE
+if (i != 0) {
+ if (sew == e8) {
+ VI_XI_SLIDEUP_PARAMS(e8, 1);
+ vd = vs2;
+ } else if(sew == e16) {
+ VI_XI_SLIDEUP_PARAMS(e16, 1);
+ vd = vs2;
+ } else if(sew == e32) {
+ VI_XI_SLIDEUP_PARAMS(e32, 1);
+ vd = vs2;
+ } else if(sew == e64) {
+ VI_XI_SLIDEUP_PARAMS(e64, 1);
+ vd = vs2;
+ }
+} else {
+ if (sew == e8) {
+ P.VU.elt<uint8_t>(rd_num, 0) = RS1;
+ } else if(sew == e16) {
+ P.VU.elt<uint16_t>(rd_num, 0) = RS1;
+ } else if(sew == e32) {
+ P.VU.elt<uint32_t>(rd_num, 0) = RS1;
+ } else if(sew == e64) {
+ P.VU.elt<uint64_t>(rd_num, 0) = RS1;
+ }
+}
+VI_LOOP_END
diff --git a/riscv/insns/vslidedown_vi.h b/riscv/insns/vslidedown_vi.h
new file mode 100644
index 0000000..c21c5f2
--- /dev/null
+++ b/riscv/insns/vslidedown_vi.h
@@ -0,0 +1,33 @@
+// vslidedown.vi vd, vs2, zimm5
+VI_LOOP_BASE
+const reg_t sh = insn.v_zimm5();
+bool is_valid = (i + sh) < P.VU.vlmax;
+reg_t offset = 0;
+
+if (is_valid) {
+ offset = sh;
+}
+
+switch (sew) {
+case e8: {
+ VI_XI_SLIDEDOWN_PARAMS(e8, offset);
+ vd = is_valid ? vs2 : 0;
+}
+break;
+case e16: {
+ VI_XI_SLIDEDOWN_PARAMS(e16, offset);
+ vd = is_valid ? vs2 : 0;
+}
+break;
+case e32: {
+ VI_XI_SLIDEDOWN_PARAMS(e32, offset);
+ vd = is_valid ? vs2 : 0;
+}
+break;
+default: {
+ VI_XI_SLIDEDOWN_PARAMS(e64, offset);
+ vd = is_valid ? vs2 : 0;
+}
+break;
+}
+VI_LOOP_END
diff --git a/riscv/insns/vslidedown_vx.h b/riscv/insns/vslidedown_vx.h
new file mode 100644
index 0000000..251740c
--- /dev/null
+++ b/riscv/insns/vslidedown_vx.h
@@ -0,0 +1,33 @@
+//vslidedown.vx vd, vs2, rs1
+VI_LOOP_BASE
+
+reg_t offset = RS1 == (reg_t)-1 ? ((RS1 & (P.VU.vlmax * 2 - 1)) + i) : RS1;
+bool is_valid = offset < P.VU.vlmax;
+
+if (!is_valid) {
+ offset = 0;
+}
+
+switch (sew) {
+case e8: {
+ VI_XI_SLIDEDOWN_PARAMS(e8, offset);
+ vd = is_valid ? vs2 : 0;
+}
+break;
+case e16: {
+ VI_XI_SLIDEDOWN_PARAMS(e16, offset);
+ vd = is_valid ? vs2 : 0;
+}
+break;
+case e32: {
+ VI_XI_SLIDEDOWN_PARAMS(e32, offset);
+ vd = is_valid ? vs2 : 0;
+}
+break;
+default: {
+ VI_XI_SLIDEDOWN_PARAMS(e64, offset);
+ vd = is_valid ? vs2 : 0;
+}
+break;
+}
+VI_LOOP_END
diff --git a/riscv/insns/vslideup_vi.h b/riscv/insns/vslideup_vi.h
new file mode 100644
index 0000000..4135b20
--- /dev/null
+++ b/riscv/insns/vslideup_vi.h
@@ -0,0 +1,33 @@
+// vslideup.vi vd, vs2, zimm5
+if (insn.v_vm() == 0)
+ require(insn.rd() != 0);
+
+VI_CHECK_SS
+const reg_t offset = insn.v_zimm5();
+VI_LOOP_BASE
+if (P.VU.vstart < offset && i < offset)
+ continue;
+
+switch (sew) {
+case e8: {
+ VI_XI_SLIDEUP_PARAMS(e8, offset);
+ vd = vs2;
+}
+break;
+case e16: {
+ VI_XI_SLIDEUP_PARAMS(e16, offset);
+ vd = vs2;
+}
+break;
+case e32: {
+ VI_XI_SLIDEUP_PARAMS(e32, offset);
+ vd = vs2;
+}
+break;
+default: {
+ VI_XI_SLIDEUP_PARAMS(e64, offset);
+ vd = vs2;
+}
+break;
+}
+VI_LOOP_END
diff --git a/riscv/insns/vslideup_vx.h b/riscv/insns/vslideup_vx.h
new file mode 100644
index 0000000..bf73fcd
--- /dev/null
+++ b/riscv/insns/vslideup_vx.h
@@ -0,0 +1,29 @@
+//vslideup.vx vd, vs2, rs1
+const reg_t offset = RS1;
+VI_LOOP_BASE
+if (P.VU.vstart < offset && i < offset)
+ continue;
+
+switch (sew) {
+case e8: {
+ VI_XI_SLIDEUP_PARAMS(e8, offset);
+ vd = vs2;
+}
+break;
+case e16: {
+ VI_XI_SLIDEUP_PARAMS(e16, offset);
+ vd = vs2;
+}
+break;
+case e32: {
+ VI_XI_SLIDEUP_PARAMS(e32, offset);
+ vd = vs2;
+}
+break;
+default: {
+ VI_XI_SLIDEUP_PARAMS(e64, offset);
+ vd = vs2;
+}
+break;
+}
+VI_LOOP_END
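
vslideup copies vs2[i - offset] into vd[i] for i >= offset, while vslidedown copies vs2[i + offset] and reads zero past VLMAX; the vslide1up/vslide1down forms above are the offset-1 cases with x[rs1] filling the vacated element. A reference sketch, ignoring masking and tails (illustrative names; vs2 is assumed to hold vlmax elements):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    void slideup_ref(std::vector<uint32_t> &vd,
                     const std::vector<uint32_t> &vs2, std::size_t offset) {
      for (std::size_t i = offset; i < vd.size(); ++i)
        vd[i] = vs2[i - offset];      // elements below 'offset' keep their value
    }

    void slidedown_ref(std::vector<uint32_t> &vd,
                       const std::vector<uint32_t> &vs2,
                       std::size_t offset, std::size_t vlmax) {
      for (std::size_t i = 0; i < vd.size(); ++i)
        vd[i] = (i + offset < vlmax) ? vs2[i + offset] : 0;
    }
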
diff --git a/riscv/insns/vsll_vi.h b/riscv/insns/vsll_vi.h
new file mode 100644
index 0000000..be46506
--- /dev/null
+++ b/riscv/insns/vsll_vi.h
@@ -0,0 +1,5 @@
+// vsll.vi vd, vs2, zimm5
+VI_VI_LOOP
+({
+ vd = vs2 << (simm5 & (sew - 1) & 0x1f);
+})
diff --git a/riscv/insns/vsll_vv.h b/riscv/insns/vsll_vv.h
new file mode 100644
index 0000000..ce82022
--- /dev/null
+++ b/riscv/insns/vsll_vv.h
@@ -0,0 +1,5 @@
+// vsll
+VI_VV_LOOP
+({
+ vd = vs2 << (vs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsll_vx.h b/riscv/insns/vsll_vx.h
new file mode 100644
index 0000000..823510b
--- /dev/null
+++ b/riscv/insns/vsll_vx.h
@@ -0,0 +1,5 @@
+// vsll
+VI_VX_LOOP
+({
+ vd = vs2 << (rs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsmul_vv.h b/riscv/insns/vsmul_vv.h
new file mode 100644
index 0000000..70d22ae
--- /dev/null
+++ b/riscv/insns/vsmul_vv.h
@@ -0,0 +1,33 @@
+// vsmul: Signed saturating and rounding fractional multiply
+VRM xrm = P.VU.get_vround_mode();
+uint64_t int_max = (1ul << (P.VU.vsew - 1)) - 1;
+uint64_t int_min = - (1 << (P.VU.vsew - 1));
+uint64_t sign_mask = ((1ul << (P.VU.vsew - 1)));
+
+VI_VV_ULOOP
+({
+ uint64_t vs1_sign;
+ uint64_t vs2_sign;
+ uint64_t result_sign;
+
+ vs1_sign = vs1 & sign_mask;
+ vs2_sign = vs2 & sign_mask;
+ bool overflow = vs1 == vs2 && vs1 == int_min;
+
+ uint128_t result = (uint128_t)vs1 * (uint128_t)vs2;
+ result &= ((uint128_t)1llu << ((sew * 2) - 2)) - 1;
+ result_sign = (vs1_sign ^ vs2_sign) & sign_mask;
+ // rounding
+ INT_ROUNDING(result, xrm, sew - 1);
+ // unsigned shifting
+ result = result >> (sew - 1);
+
+ // saturation
+ if (overflow) {
+ result = int_max;
+ P.VU.vxsat = 1;
+ } else {
+ result |= result_sign;
+ }
+ vd = result;
+})
diff --git a/riscv/insns/vsmul_vx.h b/riscv/insns/vsmul_vx.h
new file mode 100644
index 0000000..ef3751b
--- /dev/null
+++ b/riscv/insns/vsmul_vx.h
@@ -0,0 +1,34 @@
+// vsmul
+VRM xrm = P.VU.get_vround_mode();
+uint128_t int_max = (1ul << (P.VU.vsew - 1)) - 1;
+uint128_t int_min = - (1 << (P.VU.vsew - 1));
+uint128_t sign_mask = ((1ul << (P.VU.vsew - 1)));
+
+VI_VX_ULOOP
+({
+ uint128_t rs1_sign;
+ uint128_t vs2_sign;
+ uint128_t result_sign;
+
+ rs1_sign = rs1 & sign_mask;
+ vs2_sign = vs2 & sign_mask;
+ bool overflow = rs1 == vs2 && rs1 == int_min;
+
+ uint128_t result = (uint128_t)rs1 * (uint128_t)vs2;
+ result &= ((uint128_t)1llu << ((sew * 2) - 2)) - 1;
+ result_sign = (rs1_sign ^ vs2_sign) & sign_mask;
+ // rounding
+ INT_ROUNDING(result, xrm, sew - 1);
+
+ // unsigned shifting
+ result = result >> (sew - 1);
+
+ // saturation
+ if (overflow) {
+ result = int_max;
+ P.VU.vxsat = 1;
+ } else {
+ result |= result_sign;
+ }
+ vd = result;
+})
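
vsmul is the signed fractional multiply: (vs2 * vs1) >> (SEW-1) with rounding, where the only overflowing input pair is most-negative times most-negative, which saturates. A reference sketch for 32-bit elements, assuming the round-to-nearest-up mode only:

    #include <cstdint>

    int32_t vsmul32_ref(int32_t a, int32_t b, bool &vxsat) {
      if (a == INT32_MIN && b == INT32_MIN) {   // the single overflowing case
        vxsat = true;
        return INT32_MAX;
      }
      int64_t prod = (int64_t)a * (int64_t)b;
      prod += (int64_t)1 << 30;                 // rnu: add half of 2^(SEW-1)
      return (int32_t)(prod >> 31);             // drop the redundant sign bit
    }
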
diff --git a/riscv/insns/vsra_vi.h b/riscv/insns/vsra_vi.h
new file mode 100644
index 0000000..5c58927
--- /dev/null
+++ b/riscv/insns/vsra_vi.h
@@ -0,0 +1,5 @@
+// vsra.vi vd, vs2, zimm5
+VI_VI_LOOP
+({
+ vd = vs2 >> (simm5 & (sew - 1) & 0x1f);
+})
diff --git a/riscv/insns/vsra_vv.h b/riscv/insns/vsra_vv.h
new file mode 100644
index 0000000..8889af9
--- /dev/null
+++ b/riscv/insns/vsra_vv.h
@@ -0,0 +1,5 @@
+// vsra.vv vd, vs2, vs1
+VI_VV_LOOP
+({
+ vd = vs2 >> (vs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsra_vx.h b/riscv/insns/vsra_vx.h
new file mode 100644
index 0000000..c1b0c10
--- /dev/null
+++ b/riscv/insns/vsra_vx.h
@@ -0,0 +1,5 @@
+// vsra.vx vd, vs2, rs1
+VI_VX_LOOP
+({
+ vd = vs2 >> (rs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsrl_vi.h b/riscv/insns/vsrl_vi.h
new file mode 100644
index 0000000..5006854
--- /dev/null
+++ b/riscv/insns/vsrl_vi.h
@@ -0,0 +1,5 @@
+// vsrl.vi vd, vs2, zimm5
+VI_VI_ULOOP
+({
+ vd = vs2 >> (simm5 & (sew - 1) & 0x1f);
+})
diff --git a/riscv/insns/vsrl_vv.h b/riscv/insns/vsrl_vv.h
new file mode 100644
index 0000000..6376af3
--- /dev/null
+++ b/riscv/insns/vsrl_vv.h
@@ -0,0 +1,5 @@
+// vsrl.vv vd, vs2, vs1
+VI_VV_ULOOP
+({
+ vd = vs2 >> (vs1 & (sew - 1));
+})
diff --git a/riscv/insns/vsrl_vx.h b/riscv/insns/vsrl_vx.h
new file mode 100644
index 0000000..a4f899c
--- /dev/null
+++ b/riscv/insns/vsrl_vx.h
@@ -0,0 +1,5 @@
+// vsrl.vx vd, vs2, rs1
+VI_VX_ULOOP
+({
+ vd = vs2 >> (rs1 & (sew - 1));
+})
diff --git a/riscv/insns/vssra_vi.h b/riscv/insns/vssra_vi.h
new file mode 100644
index 0000000..ef2390c
--- /dev/null
+++ b/riscv/insns/vssra_vi.h
@@ -0,0 +1,8 @@
+// vssra.vi vd, vs2, simm5
+VRM xrm = P.VU.get_vround_mode();
+VI_VI_LOOP
+({
+ int sh = simm5 & (sew - 1) & 0x1f;
+ INT_ROUNDING(vs2, xrm, sh);
+ vd = vs2 >> sh;
+})
diff --git a/riscv/insns/vssra_vv.h b/riscv/insns/vssra_vv.h
new file mode 100644
index 0000000..e697b52
--- /dev/null
+++ b/riscv/insns/vssra_vv.h
@@ -0,0 +1,9 @@
+// vssra.vv vd, vs2, vs1
+VRM xrm = P.VU.get_vround_mode();
+VI_VV_LOOP
+({
+ int sh = vs1 & (sew - 1);
+
+ INT_ROUNDING(vs2, xrm, sh);
+ vd = vs2 >> sh;
+})
diff --git a/riscv/insns/vssra_vx.h b/riscv/insns/vssra_vx.h
new file mode 100644
index 0000000..8d7ad20
--- /dev/null
+++ b/riscv/insns/vssra_vx.h
@@ -0,0 +1,9 @@
+// vssra.vx vd, vs2, rs1
+VRM xrm = P.VU.get_vround_mode();
+VI_VX_LOOP
+({
+ int sh = rs1 & (sew - 1);
+
+ INT_ROUNDING(vs2, xrm, sh);
+ vd = vs2 >> sh;
+})
diff --git a/riscv/insns/vssrl_vi.h b/riscv/insns/vssrl_vi.h
new file mode 100644
index 0000000..8a10df0
--- /dev/null
+++ b/riscv/insns/vssrl_vi.h
@@ -0,0 +1,9 @@
+// vssrl.vi vd, vs2, simm5
+VRM xrm = P.VU.get_vround_mode();
+VI_VI_ULOOP
+({
+ int sh = simm5 & (sew - 1) & 0x1f;
+
+ INT_ROUNDING(vs2, xrm, sh);
+ vd = vs2 >> sh;
+})
diff --git a/riscv/insns/vssrl_vv.h b/riscv/insns/vssrl_vv.h
new file mode 100644
index 0000000..f40cd90
--- /dev/null
+++ b/riscv/insns/vssrl_vv.h
@@ -0,0 +1,9 @@
+// vssrl.vv vd, vs2, vs1
+VRM xrm = P.VU.get_vround_mode();
+VI_VV_ULOOP
+({
+ int sh = vs1 & (sew - 1);
+
+ INT_ROUNDING(vs2, xrm, sh);
+ vd = vs2 >> sh;
+})
diff --git a/riscv/insns/vssrl_vx.h b/riscv/insns/vssrl_vx.h
new file mode 100644
index 0000000..5da3f75
--- /dev/null
+++ b/riscv/insns/vssrl_vx.h
@@ -0,0 +1,9 @@
+// vssrl.vx vd, vs2, rs1
+VRM xrm = P.VU.get_vround_mode();
+VI_VX_ULOOP
+({
+ int sh = rs1 & (sew - 1);
+
+ INT_ROUNDING(vs2, xrm, sh);
+ vd = vs2 >> sh;
+})
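
The scaling shifts above apply INT_ROUNDING to the operand before shifting it right by the low bits of the shift amount. A sketch of the round-then-shift step for the unsigned case, round-to-nearest-up only (sh is assumed already masked to SEW-1):

    #include <cstdint>

    uint32_t round_shift_rnu(uint32_t v, unsigned sh) {
      if (sh == 0) return v;
      uint64_t w = (uint64_t)v + ((uint64_t)1 << (sh - 1)); // add half an LSB
      return (uint32_t)(w >> sh);
    }
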
diff --git a/riscv/insns/vssub_vv.h b/riscv/insns/vssub_vv.h
new file mode 100644
index 0000000..fd3ee21
--- /dev/null
+++ b/riscv/insns/vssub_vv.h
@@ -0,0 +1,28 @@
+// vssub.vv vd, vs2, vs1
+VI_LOOP_BASE
+bool sat = false;
+
+switch (sew) {
+case e8: {
+ VV_PARAMS(e8);
+ vd = sat_sub<int8_t, uint8_t>(vs2, vs1, sat);
+ break;
+}
+case e16: {
+ VV_PARAMS(e16);
+ vd = sat_sub<int16_t, uint16_t>(vs2, vs1, sat);
+ break;
+}
+case e32: {
+ VV_PARAMS(e32);
+ vd = sat_sub<int32_t, uint32_t>(vs2, vs1, sat);
+ break;
+}
+default: {
+ VV_PARAMS(e64);
+ vd = sat_sub<int64_t, uint64_t>(vs2, vs1, sat);
+ break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
diff --git a/riscv/insns/vssub_vx.h b/riscv/insns/vssub_vx.h
new file mode 100644
index 0000000..5c5c781
--- /dev/null
+++ b/riscv/insns/vssub_vx.h
@@ -0,0 +1,28 @@
+// vssub.vx vd, vs2, rs1
+VI_LOOP_BASE
+bool sat = false;
+
+switch (sew) {
+case e8: {
+ VX_PARAMS(e8);
+ vd = sat_sub<int8_t, uint8_t>(vs2, rs1, sat);
+ break;
+}
+case e16: {
+ VX_PARAMS(e16);
+ vd = sat_sub<int16_t, uint16_t>(vs2, rs1, sat);
+ break;
+}
+case e32: {
+ VX_PARAMS(e32);
+ vd = sat_sub<int32_t, uint32_t>(vs2, rs1, sat);
+ break;
+}
+default: {
+ VX_PARAMS(e64);
+ vd = sat_sub<int64_t, uint64_t>(vs2, rs1, sat);
+ break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
diff --git a/riscv/insns/vssubu_vv.h b/riscv/insns/vssubu_vv.h
new file mode 100644
index 0000000..c5c74fe
--- /dev/null
+++ b/riscv/insns/vssubu_vv.h
@@ -0,0 +1,29 @@
+// vssubu.vv vd, vs2, vs1
+VI_LOOP_BASE
+bool sat = false;
+
+switch (sew) {
+case e8: {
+ VV_U_PARAMS(e8);
+ vd = sat_subu<uint8_t>(vs2, vs1, sat);
+ break;
+}
+case e16: {
+ VV_U_PARAMS(e16);
+ vd = sat_subu<uint16_t>(vs2, vs1, sat);
+ break;
+}
+case e32: {
+ VV_U_PARAMS(e32);
+ vd = sat_subu<uint32_t>(vs2, vs1, sat);
+ break;
+}
+default: {
+ VV_U_PARAMS(e64);
+ vd = sat_subu<uint64_t>(vs2, vs1, sat);
+ break;
+}
+}
+P.VU.vxsat |= sat;
+
+VI_LOOP_END
diff --git a/riscv/insns/vssubu_vx.h b/riscv/insns/vssubu_vx.h
new file mode 100644
index 0000000..12cfdbb
--- /dev/null
+++ b/riscv/insns/vssubu_vx.h
@@ -0,0 +1,28 @@
+// vssubu.vx vd, vs2, rs1
+VI_LOOP_BASE
+bool sat = false;
+
+switch (sew) {
+case e8: {
+ VX_U_PARAMS(e8);
+ vd = sat_subu<uint8_t>(vs2, rs1, sat);
+ break;
+}
+case e16: {
+ VX_U_PARAMS(e16);
+ vd = sat_subu<uint16_t>(vs2, rs1, sat);
+ break;
+}
+case e32: {
+ VX_U_PARAMS(e32);
+ vd = sat_subu<uint32_t>(vs2, rs1, sat);
+ break;
+}
+default: {
+ VX_U_PARAMS(e64);
+ vd = sat_subu<uint64_t>(vs2, rs1, sat);
+ break;
+}
+}
+P.VU.vxsat |= sat;
+VI_LOOP_END
diff --git a/riscv/insns/vsub_vv.h b/riscv/insns/vsub_vv.h
new file mode 100644
index 0000000..7d119d5
--- /dev/null
+++ b/riscv/insns/vsub_vv.h
@@ -0,0 +1,5 @@
+// vsub
+VI_VV_LOOP
+({
+ vd = vs2 - vs1;
+})
diff --git a/riscv/insns/vsub_vx.h b/riscv/insns/vsub_vx.h
new file mode 100644
index 0000000..e075b42
--- /dev/null
+++ b/riscv/insns/vsub_vx.h
@@ -0,0 +1,5 @@
+// vsub.vx: vd[i] = vs2[i] - x[rs1]
+VI_VX_LOOP
+({
+ vd = vs2 - rs1;
+})
diff --git a/riscv/insns/vwadd_vv.h b/riscv/insns/vwadd_vv.h
new file mode 100644
index 0000000..df4a135
--- /dev/null
+++ b/riscv/insns/vwadd_vv.h
@@ -0,0 +1,6 @@
+// vwadd.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, +, +, int);
+})
diff --git a/riscv/insns/vwadd_vx.h b/riscv/insns/vwadd_vx.h
new file mode 100644
index 0000000..c226389
--- /dev/null
+++ b/riscv/insns/vwadd_vx.h
@@ -0,0 +1,6 @@
+// vwadd.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, +, +, int);
+})
diff --git a/riscv/insns/vwadd_wv.h b/riscv/insns/vwadd_wv.h
new file mode 100644
index 0000000..54d2ba4
--- /dev/null
+++ b/riscv/insns/vwadd_wv.h
@@ -0,0 +1,6 @@
+// vwadd.wv vd, vs2, vs1
+VI_CHECK_DDS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_WVX_OP(vs1, +, int);
+})
diff --git a/riscv/insns/vwadd_wx.h b/riscv/insns/vwadd_wx.h
new file mode 100644
index 0000000..bb4cee5
--- /dev/null
+++ b/riscv/insns/vwadd_wx.h
@@ -0,0 +1,6 @@
+// vwadd.wx vd, vs2, rs1
+VI_CHECK_DDS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_WVX_OP(rs1, +, int);
+})
diff --git a/riscv/insns/vwaddu_vv.h b/riscv/insns/vwaddu_vv.h
new file mode 100644
index 0000000..286ebc8
--- /dev/null
+++ b/riscv/insns/vwaddu_vv.h
@@ -0,0 +1,6 @@
+// vwaddu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, +, +, uint);
+})
diff --git a/riscv/insns/vwaddu_vx.h b/riscv/insns/vwaddu_vx.h
new file mode 100644
index 0000000..61cddfc
--- /dev/null
+++ b/riscv/insns/vwaddu_vx.h
@@ -0,0 +1,6 @@
+// vwaddu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, +, +, uint);
+})
diff --git a/riscv/insns/vwaddu_wv.h b/riscv/insns/vwaddu_wv.h
new file mode 100644
index 0000000..fee8136
--- /dev/null
+++ b/riscv/insns/vwaddu_wv.h
@@ -0,0 +1,6 @@
+// vwaddu.wv vd, vs2, vs1
+VI_CHECK_DDS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_WVX_OP(vs1, +, uint);
+})
diff --git a/riscv/insns/vwaddu_wx.h b/riscv/insns/vwaddu_wx.h
new file mode 100644
index 0000000..0073ac3
--- /dev/null
+++ b/riscv/insns/vwaddu_wx.h
@@ -0,0 +1,6 @@
+// vwaddu.wx vd, vs2, rs1
+VI_CHECK_DDS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_WVX_OP(rs1, +, uint);
+})
diff --git a/riscv/insns/vwmacc_vv.h b/riscv/insns/vwmacc_vv.h
new file mode 100644
index 0000000..7208c6d
--- /dev/null
+++ b/riscv/insns/vwmacc_vv.h
@@ -0,0 +1,6 @@
+// vwmacc.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, vs1, vd_w, *, +, int);
+})
diff --git a/riscv/insns/vwmacc_vx.h b/riscv/insns/vwmacc_vx.h
new file mode 100644
index 0000000..5ae597a
--- /dev/null
+++ b/riscv/insns/vwmacc_vx.h
@@ -0,0 +1,6 @@
+// vwmacc.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, rs1, vd_w, *, +, int);
+})
diff --git a/riscv/insns/vwmaccsu_vv.h b/riscv/insns/vwmaccsu_vv.h
new file mode 100644
index 0000000..3aa43ef
--- /dev/null
+++ b/riscv/insns/vwmaccsu_vv.h
@@ -0,0 +1,6 @@
+// vwmaccsu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN_MIX(vs2, vs1, vd_w, *, +, int, uint, int);
+})
diff --git a/riscv/insns/vwmaccsu_vx.h b/riscv/insns/vwmaccsu_vx.h
new file mode 100644
index 0000000..e00a21d
--- /dev/null
+++ b/riscv/insns/vwmaccsu_vx.h
@@ -0,0 +1,6 @@
+// vwmaccsu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN_MIX(vs2, rs1, vd_w, *, +, int, uint, int);
+})
diff --git a/riscv/insns/vwmaccu_vv.h b/riscv/insns/vwmaccu_vv.h
new file mode 100644
index 0000000..2cbdaa3
--- /dev/null
+++ b/riscv/insns/vwmaccu_vv.h
@@ -0,0 +1,6 @@
+// vwmaccu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, vs1, vd_w, *, +, uint);
+})
diff --git a/riscv/insns/vwmaccu_vx.h b/riscv/insns/vwmaccu_vx.h
new file mode 100644
index 0000000..533297f
--- /dev/null
+++ b/riscv/insns/vwmaccu_vx.h
@@ -0,0 +1,6 @@
+// vwmaccu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, rs1, vd_w, *, +, uint);
+})
diff --git a/riscv/insns/vwmaccus_vx.h b/riscv/insns/vwmaccus_vx.h
new file mode 100644
index 0000000..5310f0e
--- /dev/null
+++ b/riscv/insns/vwmaccus_vx.h
@@ -0,0 +1,6 @@
+// vwmaccus.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN_MIX(vs2, rs1, vd_w, *, +, int, int, uint);
+})
diff --git a/riscv/insns/vwmul_vv.h b/riscv/insns/vwmul_vv.h
new file mode 100644
index 0000000..2197edb
--- /dev/null
+++ b/riscv/insns/vwmul_vv.h
@@ -0,0 +1,6 @@
+// vwmul.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, *, +, int);
+})
diff --git a/riscv/insns/vwmul_vx.h b/riscv/insns/vwmul_vx.h
new file mode 100644
index 0000000..bc1422d
--- /dev/null
+++ b/riscv/insns/vwmul_vx.h
@@ -0,0 +1,6 @@
+// vwmul.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, *, +, int);
+})
diff --git a/riscv/insns/vwmulsu_vv.h b/riscv/insns/vwmulsu_vv.h
new file mode 100644
index 0000000..9786adb
--- /dev/null
+++ b/riscv/insns/vwmulsu_vv.h
@@ -0,0 +1,16 @@
+// vwmulsu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+ switch(P.VU.vsew) {
+ case e8:
+ P.VU.elt<uint16_t>(rd_num, i) = (int16_t)(int8_t)vs2 * (int16_t)(uint8_t)vs1;
+ break;
+ case e16:
+ P.VU.elt<uint32_t>(rd_num, i) = (int32_t)(int16_t)vs2 * (int32_t)(uint16_t)vs1;
+ break;
+ default:
+ P.VU.elt<uint64_t>(rd_num, i) = (int64_t)(int32_t)vs2 * (int64_t)(uint32_t)vs1;
+ break;
+ }
+})
diff --git a/riscv/insns/vwmulsu_vx.h b/riscv/insns/vwmulsu_vx.h
new file mode 100644
index 0000000..feb1fd1
--- /dev/null
+++ b/riscv/insns/vwmulsu_vx.h
@@ -0,0 +1,16 @@
+// vwmulsu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ switch(P.VU.vsew) {
+ case e8:
+ P.VU.elt<uint16_t>(rd_num, i) = (int16_t)(int8_t)vs2 * (int16_t)(uint8_t)rs1;
+ break;
+ case e16:
+ P.VU.elt<uint32_t>(rd_num, i) = (int32_t)(int16_t)vs2 * (int32_t)(uint16_t)rs1;
+ break;
+ default:
+ P.VU.elt<uint64_t>(rd_num, i) = (int64_t)(int32_t)vs2 * (int64_t)(uint32_t)rs1;
+ break;
+ }
+})
diff --git a/riscv/insns/vwmulu_vv.h b/riscv/insns/vwmulu_vv.h
new file mode 100644
index 0000000..8ddbb4b
--- /dev/null
+++ b/riscv/insns/vwmulu_vv.h
@@ -0,0 +1,6 @@
+// vwmulu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, *, +, uint);
+})
diff --git a/riscv/insns/vwmulu_vx.h b/riscv/insns/vwmulu_vx.h
new file mode 100644
index 0000000..1ce77ee
--- /dev/null
+++ b/riscv/insns/vwmulu_vx.h
@@ -0,0 +1,6 @@
+// vwmulu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, *, +, uint);
+})
diff --git a/riscv/insns/vwredsum_vs.h b/riscv/insns/vwredsum_vs.h
new file mode 100644
index 0000000..c7a87db
--- /dev/null
+++ b/riscv/insns/vwredsum_vs.h
@@ -0,0 +1,5 @@
+// vwredsum.vs vd, vs2, vs1
+VI_VV_LOOP_WIDE_REDUCTION
+({
+ vd_0_res += vs2;
+})
diff --git a/riscv/insns/vwredsumu_vs.h b/riscv/insns/vwredsumu_vs.h
new file mode 100644
index 0000000..889a77d
--- /dev/null
+++ b/riscv/insns/vwredsumu_vs.h
@@ -0,0 +1,5 @@
+// vwredsumu.vs vd, vs2, vs1
+VI_VV_ULOOP_WIDE_REDUCTION
+({
+ vd_0_res += vs2;
+})
diff --git a/riscv/insns/vwsmacc_vv.h b/riscv/insns/vwsmacc_vv.h
new file mode 100644
index 0000000..86d588d
--- /dev/null
+++ b/riscv/insns/vwsmacc_vv.h
@@ -0,0 +1,2 @@
+// vwsmacc.vv vd, vs2, vs1
+VI_VVX_LOOP_WIDE_SSMA(vs1);
diff --git a/riscv/insns/vwsmacc_vx.h b/riscv/insns/vwsmacc_vx.h
new file mode 100644
index 0000000..f0f04a3
--- /dev/null
+++ b/riscv/insns/vwsmacc_vx.h
@@ -0,0 +1,2 @@
+// vwsmacc.vx vd, vs2, rs1
+VI_VVX_LOOP_WIDE_SSMA(rs1);
diff --git a/riscv/insns/vwsmaccsu_vv.h b/riscv/insns/vwsmaccsu_vv.h
new file mode 100644
index 0000000..cf1aa1e
--- /dev/null
+++ b/riscv/insns/vwsmaccsu_vv.h
@@ -0,0 +1,2 @@
+// vwsmaccsu.vv vd, vs2, vs1
+VI_VVX_LOOP_WIDE_SU_SSMA(vs1);
diff --git a/riscv/insns/vwsmaccsu_vx.h b/riscv/insns/vwsmaccsu_vx.h
new file mode 100644
index 0000000..681c309
--- /dev/null
+++ b/riscv/insns/vwsmaccsu_vx.h
@@ -0,0 +1,2 @@
+// vwsmaccsu.vx vd, vs2, rs1
+VI_VVX_LOOP_WIDE_SU_SSMA(rs1);
diff --git a/riscv/insns/vwsmaccu_vv.h b/riscv/insns/vwsmaccu_vv.h
new file mode 100644
index 0000000..e873d93
--- /dev/null
+++ b/riscv/insns/vwsmaccu_vv.h
@@ -0,0 +1,2 @@
+// vwsmaccu.vv vd, vs2, vs1
+VI_VVX_LOOP_WIDE_USSMA(vs1);
diff --git a/riscv/insns/vwsmaccu_vx.h b/riscv/insns/vwsmaccu_vx.h
new file mode 100644
index 0000000..7318fa7
--- /dev/null
+++ b/riscv/insns/vwsmaccu_vx.h
@@ -0,0 +1,2 @@
+// vwsmaccu.vx vd, vs2, rs1
+VI_VVX_LOOP_WIDE_USSMA(rs1);
diff --git a/riscv/insns/vwsmaccus_vx.h b/riscv/insns/vwsmaccus_vx.h
new file mode 100644
index 0000000..da1a1c8
--- /dev/null
+++ b/riscv/insns/vwsmaccus_vx.h
@@ -0,0 +1,2 @@
+// vwsmaccus.vx vd, vs2, rs1
+VI_VVX_LOOP_WIDE_US_SSMA(rs1);
diff --git a/riscv/insns/vwsub_vv.h b/riscv/insns/vwsub_vv.h
new file mode 100644
index 0000000..99f9348
--- /dev/null
+++ b/riscv/insns/vwsub_vv.h
@@ -0,0 +1,6 @@
+// vwsub.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, -, +, int);
+})
diff --git a/riscv/insns/vwsub_vx.h b/riscv/insns/vwsub_vx.h
new file mode 100644
index 0000000..affdf62
--- /dev/null
+++ b/riscv/insns/vwsub_vx.h
@@ -0,0 +1,6 @@
+// vwsub.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, -, +, int);
+})
diff --git a/riscv/insns/vwsub_wv.h b/riscv/insns/vwsub_wv.h
new file mode 100644
index 0000000..10db730
--- /dev/null
+++ b/riscv/insns/vwsub_wv.h
@@ -0,0 +1,6 @@
+// vwsub.wv vd, vs2, vs1
+VI_CHECK_DDS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_WVX_OP(vs1, -, int);
+})
diff --git a/riscv/insns/vwsub_wx.h b/riscv/insns/vwsub_wx.h
new file mode 100644
index 0000000..f72341b
--- /dev/null
+++ b/riscv/insns/vwsub_wx.h
@@ -0,0 +1,6 @@
+// vwsub.wx vd, vs2, rs1
+VI_CHECK_DDS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_WVX_OP(rs1, -, int);
+})
diff --git a/riscv/insns/vwsubu_vv.h b/riscv/insns/vwsubu_vv.h
new file mode 100644
index 0000000..cf68adb
--- /dev/null
+++ b/riscv/insns/vwsubu_vv.h
@@ -0,0 +1,6 @@
+// vwsubu.vv vd, vs2, vs1
+VI_CHECK_DSS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, vs1, 0, -, +, uint);
+})
diff --git a/riscv/insns/vwsubu_vx.h b/riscv/insns/vwsubu_vx.h
new file mode 100644
index 0000000..3e972dd
--- /dev/null
+++ b/riscv/insns/vwsubu_vx.h
@@ -0,0 +1,6 @@
+// vwsubu.vx vd, vs2, rs1
+VI_CHECK_DSS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_OP_AND_ASSIGN(vs2, rs1, 0, -, +, uint);
+})
diff --git a/riscv/insns/vwsubu_wv.h b/riscv/insns/vwsubu_wv.h
new file mode 100644
index 0000000..3687c3d
--- /dev/null
+++ b/riscv/insns/vwsubu_wv.h
@@ -0,0 +1,6 @@
+// vwsubu.wv vd, vs2, vs1
+VI_CHECK_DDS(true);
+VI_VV_LOOP_WIDEN
+({
+ VI_WIDE_WVX_OP(vs1, -, uint);
+})
diff --git a/riscv/insns/vwsubu_wx.h b/riscv/insns/vwsubu_wx.h
new file mode 100644
index 0000000..c7f20ed
--- /dev/null
+++ b/riscv/insns/vwsubu_wx.h
@@ -0,0 +1,6 @@
+// vwsubu.wx vd, vs2, rs1
+VI_CHECK_DDS(false);
+VI_VX_LOOP_WIDEN
+({
+ VI_WIDE_WVX_OP(rs1, -, uint);
+})
diff --git a/riscv/insns/vxor_vi.h b/riscv/insns/vxor_vi.h
new file mode 100644
index 0000000..b2dcf94
--- /dev/null
+++ b/riscv/insns/vxor_vi.h
@@ -0,0 +1,5 @@
+// vxor.vi vd, vs2, simm5
+VI_VI_LOOP
+({
+ vd = simm5 ^ vs2;
+})
diff --git a/riscv/insns/vxor_vv.h b/riscv/insns/vxor_vv.h
new file mode 100644
index 0000000..c37b6ab
--- /dev/null
+++ b/riscv/insns/vxor_vv.h
@@ -0,0 +1,5 @@
+// vxor.vv vd, vs2, vs1
+VI_VV_LOOP
+({
+ vd = vs1 ^ vs2;
+})
diff --git a/riscv/insns/vxor_vx.h b/riscv/insns/vxor_vx.h
new file mode 100644
index 0000000..8021e0e
--- /dev/null
+++ b/riscv/insns/vxor_vx.h
@@ -0,0 +1,5 @@
+// vxor.vx vd, vs2, rs1
+VI_VX_LOOP
+({
+ vd = rs1 ^ vs2;
+})
diff --git a/riscv/riscv.mk.in b/riscv/riscv.mk.in
index 4d538c8..e8c7f04 100644
--- a/riscv/riscv.mk.in
+++ b/riscv/riscv.mk.in
@@ -291,11 +291,217 @@ riscv_insn_ext_q = \
fsqrt_q \
fsub_q \
+riscv_insn_ext_v_alu_int = \
+ vaadd_vi \
+ vaadd_vv \
+ vaadd_vx \
+ vadc_vim \
+ vadc_vvm \
+ vadc_vxm \
+ vadd_vi \
+ vadd_vv \
+ vadd_vx \
+ vand_vi \
+ vand_vv \
+ vand_vx \
+ vasub_vv \
+ vasub_vx \
+ vcompress_vm \
+ vdiv_vv \
+ vdiv_vx \
+ vdivu_vv \
+ vdivu_vx \
+ vdot_vv \
+ vdotu_vv \
+ vext_x_v \
+ vid_v \
+ viota_m \
+ vmacc_vv \
+ vmacc_vx \
+ vmadc_vim \
+ vmadc_vvm \
+ vmadc_vxm \
+ vmadd_vv \
+ vmadd_vx \
+ vmand_mm \
+ vmandnot_mm \
+ vmax_vv \
+ vmax_vx \
+ vmaxu_vv \
+ vmaxu_vx \
+ vmerge_vim \
+ vmerge_vvm \
+ vmerge_vxm \
+ vmfirst_m \
+ vmin_vv \
+ vmin_vx \
+ vminu_vv \
+ vminu_vx \
+ vmnand_mm \
+ vmnor_mm \
+ vmor_mm \
+ vmornot_mm \
+ vmpopc_m \
+ vmsbc_vvm \
+ vmsbc_vxm \
+ vmsbf_m \
+ vmseq_vi \
+ vmseq_vv \
+ vmseq_vx \
+ vmsgt_vi \
+ vmsgt_vx \
+ vmsgtu_vi \
+ vmsgtu_vx \
+ vmsif_m \
+ vmsle_vi \
+ vmsle_vv \
+ vmsle_vx \
+ vmsleu_vi \
+ vmsleu_vv \
+ vmsleu_vx \
+ vmslt_vv \
+ vmslt_vx \
+ vmsltu_vv \
+ vmsltu_vx \
+ vmsne_vi \
+ vmsne_vv \
+ vmsne_vx \
+ vmsof_m \
+ vmul_vv \
+ vmul_vx \
+ vmulh_vv \
+ vmulh_vx \
+ vmulhsu_vv \
+ vmulhsu_vx \
+ vmulhu_vv \
+ vmulhu_vx \
+ vmv_s_x \
+ vmv_v_i \
+ vmv_v_v \
+ vmv_v_x \
+ vmxnor_mm \
+ vmxor_mm \
+ vnclip_vi \
+ vnclip_vv \
+ vnclip_vx \
+ vnclipu_vi \
+ vnclipu_vv \
+ vnclipu_vx \
+ vnmsac_vv \
+ vnmsac_vx \
+ vnmsub_vv \
+ vnmsub_vx \
+ vnsra_vi \
+ vnsra_vv \
+ vnsra_vx \
+ vnsrl_vi \
+ vnsrl_vv \
+ vnsrl_vx \
+ vor_vi \
+ vor_vv \
+ vor_vx \
+ vredand_vs \
+ vredmax_vs \
+ vredmaxu_vs \
+ vredmin_vs \
+ vredminu_vs \
+ vredor_vs \
+ vredsum_vs \
+ vredxor_vs \
+ vrem_vv \
+ vrem_vx \
+ vremu_vv \
+ vremu_vx \
+ vrgather_vi \
+ vrgather_vv \
+ vrgather_vx \
+ vrsub_vi \
+ vrsub_vx \
+ vsadd_vi \
+ vsadd_vv \
+ vsadd_vx \
+ vsaddu_vi \
+ vsaddu_vv \
+ vsaddu_vx \
+ vsbc_vvm \
+ vsbc_vxm \
+ vslide1down_vx \
+ vslide1up_vx \
+ vslidedown_vi \
+ vslidedown_vx \
+ vslideup_vi \
+ vslideup_vx \
+ vsll_vi \
+ vsll_vv \
+ vsll_vx \
+ vsmul_vv \
+ vsmul_vx \
+ vsra_vi \
+ vsra_vv \
+ vsra_vx \
+ vsrl_vi \
+ vsrl_vv \
+ vsrl_vx \
+ vssra_vi \
+ vssra_vv \
+ vssra_vx \
+ vssrl_vi \
+ vssrl_vv \
+ vssrl_vx \
+ vssub_vv \
+ vssub_vx \
+ vssubu_vv \
+ vssubu_vx \
+ vsub_vv \
+ vsub_vx \
+ vwadd_vv \
+ vwadd_vx \
+ vwadd_wv \
+ vwadd_wx \
+ vwaddu_vv \
+ vwaddu_vx \
+ vwaddu_wv \
+ vwaddu_wx \
+ vwmacc_vv \
+ vwmacc_vx \
+ vwmaccsu_vv \
+ vwmaccsu_vx \
+ vwmaccu_vv \
+ vwmaccu_vx \
+ vwmaccus_vx \
+ vwmul_vv \
+ vwmul_vx \
+ vwmulsu_vv \
+ vwmulsu_vx \
+ vwmulu_vv \
+ vwmulu_vx \
+ vwredsum_vs \
+ vwredsumu_vs \
+ vwsmacc_vv \
+ vwsmacc_vx \
+ vwsmaccsu_vv \
+ vwsmaccsu_vx \
+ vwsmaccu_vv \
+ vwsmaccu_vx \
+ vwsmaccus_vx \
+ vwsub_vv \
+ vwsub_vx \
+ vwsub_wv \
+ vwsub_wx \
+ vwsubu_vv \
+ vwsubu_vx \
+ vwsubu_wv \
+ vwsubu_wx \
+ vxor_vi \
+ vxor_vv \
+ vxor_vx \
+
riscv_insn_ext_v_ctrl = \
vsetvli \
vsetvl \
riscv_insn_ext_v = \
+ $(riscv_insn_ext_v_alu_int) \
$(riscv_insn_ext_v_ctrl) \
riscv_insn_priv = \