Diffstat (limited to 'llvm/test/CodeGen/RISCV')
-rw-r--r--  llvm/test/CodeGen/RISCV/abds.ll                           216
-rw-r--r--  llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll                 14
-rw-r--r--  llvm/test/CodeGen/RISCV/combine-storetomstore.ll          684
-rw-r--r--  llvm/test/CodeGen/RISCV/features-info.ll                   10
-rw-r--r--  llvm/test/CodeGen/RISCV/fpclamptosat.ll                   208
-rw-r--r--  llvm/test/CodeGen/RISCV/half-convert.ll                   108
-rw-r--r--  llvm/test/CodeGen/RISCV/iabs.ll                            80
-rw-r--r--  llvm/test/CodeGen/RISCV/macro-fusions.mir                1376
-rw-r--r--  llvm/test/CodeGen/RISCV/misched-load-clustering.ll         47
-rw-r--r--  llvm/test/CodeGen/RISCV/misched-mem-clustering.mir          6
-rw-r--r--  llvm/test/CodeGen/RISCV/misched-store-clustering.ll        83
-rw-r--r--  llvm/test/CodeGen/RISCV/note-gnu-property-zicfiss.ll        9
-rw-r--r--  llvm/test/CodeGen/RISCV/rnmi-interrupt-attr-error.ll        9
-rw-r--r--  llvm/test/CodeGen/RISCV/rnmi-interrupt-attr.ll            373
-rw-r--r--  llvm/test/CodeGen/RISCV/rv32zbb.ll                         56
-rw-r--r--  llvm/test/CodeGen/RISCV/rv32zbkb.ll                       139
-rw-r--r--  llvm/test/CodeGen/RISCV/rv32zbs.ll                         12
-rw-r--r--  llvm/test/CodeGen/RISCV/rv64-half-convert.ll               21
-rw-r--r--  llvm/test/CodeGen/RISCV/rv64zbkb.ll                       214
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll     586
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fp4-bitcast.ll                 33
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll           340
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll         21
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll          31
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll            42
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vp-splice.ll                  136
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vploadff.ll                  1008
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll                     10
-rw-r--r--  llvm/test/CodeGen/RISCV/unaligned-load-store.ll            20
-rw-r--r--  llvm/test/CodeGen/RISCV/xqcilsm-memset.ll                 900
30 files changed, 6146 insertions, 646 deletions
diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll
index 28a95ef..f11a9c8 100644
--- a/llvm/test/CodeGen/RISCV/abds.ll
+++ b/llvm/test/CodeGen/RISCV/abds.ll
@@ -2011,50 +2011,50 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
; RV32I-LABEL: abd_subnsw_i128:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a4, 0(a2)
-; RV32I-NEXT: lw a3, 4(a2)
+; RV32I-NEXT: lw a3, 0(a2)
+; RV32I-NEXT: lw a4, 4(a2)
; RV32I-NEXT: lw a5, 8(a2)
-; RV32I-NEXT: lw a6, 12(a2)
+; RV32I-NEXT: lw a2, 12(a2)
; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: lw t0, 12(a1)
-; RV32I-NEXT: lw a2, 0(a1)
+; RV32I-NEXT: lw a6, 0(a1)
; RV32I-NEXT: lw a1, 4(a1)
; RV32I-NEXT: sltu t1, a7, a5
-; RV32I-NEXT: sub t0, t0, a6
-; RV32I-NEXT: sltu a6, a2, a4
+; RV32I-NEXT: sub t0, t0, a2
+; RV32I-NEXT: sltu a2, a6, a3
; RV32I-NEXT: sub t0, t0, t1
-; RV32I-NEXT: mv t1, a6
-; RV32I-NEXT: beq a1, a3, .LBB31_2
+; RV32I-NEXT: mv t1, a2
+; RV32I-NEXT: beq a1, a4, .LBB31_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: sltu t1, a1, a3
+; RV32I-NEXT: sltu t1, a1, a4
; RV32I-NEXT: .LBB31_2:
; RV32I-NEXT: sub a5, a7, a5
-; RV32I-NEXT: sub a3, a1, a3
-; RV32I-NEXT: sltu a1, a5, t1
+; RV32I-NEXT: sub a1, a1, a4
+; RV32I-NEXT: sltu a4, a5, t1
; RV32I-NEXT: sub a5, a5, t1
-; RV32I-NEXT: sub a1, t0, a1
-; RV32I-NEXT: sub a3, a3, a6
-; RV32I-NEXT: sub a2, a2, a4
-; RV32I-NEXT: bgez a1, .LBB31_4
+; RV32I-NEXT: sub a4, t0, a4
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: sub a1, a6, a3
+; RV32I-NEXT: bgez a4, .LBB31_4
; RV32I-NEXT: # %bb.3:
-; RV32I-NEXT: snez a4, a3
-; RV32I-NEXT: snez a6, a2
+; RV32I-NEXT: snez a3, a2
+; RV32I-NEXT: snez a6, a1
; RV32I-NEXT: neg a7, a5
; RV32I-NEXT: snez a5, a5
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: add a4, a4, a5
+; RV32I-NEXT: add a2, a2, a6
+; RV32I-NEXT: sltu a6, a7, a3
+; RV32I-NEXT: neg a4, a4
+; RV32I-NEXT: sub a5, a7, a3
; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: add a1, a1, a5
-; RV32I-NEXT: add a3, a3, a6
-; RV32I-NEXT: sltu a6, a7, a4
+; RV32I-NEXT: sub a4, a4, a6
; RV32I-NEXT: neg a1, a1
-; RV32I-NEXT: sub a5, a7, a4
-; RV32I-NEXT: sub a1, a1, a6
-; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: .LBB31_4:
-; RV32I-NEXT: sw a2, 0(a0)
-; RV32I-NEXT: sw a3, 4(a0)
+; RV32I-NEXT: sw a1, 0(a0)
+; RV32I-NEXT: sw a2, 4(a0)
; RV32I-NEXT: sw a5, 8(a0)
-; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: sw a4, 12(a0)
; RV32I-NEXT: ret
;
; RV64I-LABEL: abd_subnsw_i128:
@@ -2074,50 +2074,50 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
;
; RV32ZBB-LABEL: abd_subnsw_i128:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: lw a4, 0(a2)
-; RV32ZBB-NEXT: lw a3, 4(a2)
+; RV32ZBB-NEXT: lw a3, 0(a2)
+; RV32ZBB-NEXT: lw a4, 4(a2)
; RV32ZBB-NEXT: lw a5, 8(a2)
-; RV32ZBB-NEXT: lw a6, 12(a2)
+; RV32ZBB-NEXT: lw a2, 12(a2)
; RV32ZBB-NEXT: lw a7, 8(a1)
; RV32ZBB-NEXT: lw t0, 12(a1)
-; RV32ZBB-NEXT: lw a2, 0(a1)
+; RV32ZBB-NEXT: lw a6, 0(a1)
; RV32ZBB-NEXT: lw a1, 4(a1)
; RV32ZBB-NEXT: sltu t1, a7, a5
-; RV32ZBB-NEXT: sub t0, t0, a6
-; RV32ZBB-NEXT: sltu a6, a2, a4
+; RV32ZBB-NEXT: sub t0, t0, a2
+; RV32ZBB-NEXT: sltu a2, a6, a3
; RV32ZBB-NEXT: sub t0, t0, t1
-; RV32ZBB-NEXT: mv t1, a6
-; RV32ZBB-NEXT: beq a1, a3, .LBB31_2
+; RV32ZBB-NEXT: mv t1, a2
+; RV32ZBB-NEXT: beq a1, a4, .LBB31_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: sltu t1, a1, a3
+; RV32ZBB-NEXT: sltu t1, a1, a4
; RV32ZBB-NEXT: .LBB31_2:
; RV32ZBB-NEXT: sub a5, a7, a5
-; RV32ZBB-NEXT: sub a3, a1, a3
-; RV32ZBB-NEXT: sltu a1, a5, t1
+; RV32ZBB-NEXT: sub a1, a1, a4
+; RV32ZBB-NEXT: sltu a4, a5, t1
; RV32ZBB-NEXT: sub a5, a5, t1
-; RV32ZBB-NEXT: sub a1, t0, a1
-; RV32ZBB-NEXT: sub a3, a3, a6
-; RV32ZBB-NEXT: sub a2, a2, a4
-; RV32ZBB-NEXT: bgez a1, .LBB31_4
+; RV32ZBB-NEXT: sub a4, t0, a4
+; RV32ZBB-NEXT: sub a2, a1, a2
+; RV32ZBB-NEXT: sub a1, a6, a3
+; RV32ZBB-NEXT: bgez a4, .LBB31_4
; RV32ZBB-NEXT: # %bb.3:
-; RV32ZBB-NEXT: snez a4, a3
-; RV32ZBB-NEXT: snez a6, a2
+; RV32ZBB-NEXT: snez a3, a2
+; RV32ZBB-NEXT: snez a6, a1
; RV32ZBB-NEXT: neg a7, a5
; RV32ZBB-NEXT: snez a5, a5
+; RV32ZBB-NEXT: or a3, a6, a3
+; RV32ZBB-NEXT: add a4, a4, a5
+; RV32ZBB-NEXT: add a2, a2, a6
+; RV32ZBB-NEXT: sltu a6, a7, a3
+; RV32ZBB-NEXT: neg a4, a4
+; RV32ZBB-NEXT: sub a5, a7, a3
; RV32ZBB-NEXT: neg a2, a2
-; RV32ZBB-NEXT: or a4, a6, a4
-; RV32ZBB-NEXT: add a1, a1, a5
-; RV32ZBB-NEXT: add a3, a3, a6
-; RV32ZBB-NEXT: sltu a6, a7, a4
+; RV32ZBB-NEXT: sub a4, a4, a6
; RV32ZBB-NEXT: neg a1, a1
-; RV32ZBB-NEXT: sub a5, a7, a4
-; RV32ZBB-NEXT: sub a1, a1, a6
-; RV32ZBB-NEXT: neg a3, a3
; RV32ZBB-NEXT: .LBB31_4:
-; RV32ZBB-NEXT: sw a2, 0(a0)
-; RV32ZBB-NEXT: sw a3, 4(a0)
+; RV32ZBB-NEXT: sw a1, 0(a0)
+; RV32ZBB-NEXT: sw a2, 4(a0)
; RV32ZBB-NEXT: sw a5, 8(a0)
-; RV32ZBB-NEXT: sw a1, 12(a0)
+; RV32ZBB-NEXT: sw a4, 12(a0)
; RV32ZBB-NEXT: ret
;
; RV64ZBB-LABEL: abd_subnsw_i128:
@@ -2142,50 +2142,50 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
; RV32I-LABEL: abd_subnsw_i128_undef:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a4, 0(a2)
-; RV32I-NEXT: lw a3, 4(a2)
+; RV32I-NEXT: lw a3, 0(a2)
+; RV32I-NEXT: lw a4, 4(a2)
; RV32I-NEXT: lw a5, 8(a2)
-; RV32I-NEXT: lw a6, 12(a2)
+; RV32I-NEXT: lw a2, 12(a2)
; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: lw t0, 12(a1)
-; RV32I-NEXT: lw a2, 0(a1)
+; RV32I-NEXT: lw a6, 0(a1)
; RV32I-NEXT: lw a1, 4(a1)
; RV32I-NEXT: sltu t1, a7, a5
-; RV32I-NEXT: sub t0, t0, a6
-; RV32I-NEXT: sltu a6, a2, a4
+; RV32I-NEXT: sub t0, t0, a2
+; RV32I-NEXT: sltu a2, a6, a3
; RV32I-NEXT: sub t0, t0, t1
-; RV32I-NEXT: mv t1, a6
-; RV32I-NEXT: beq a1, a3, .LBB32_2
+; RV32I-NEXT: mv t1, a2
+; RV32I-NEXT: beq a1, a4, .LBB32_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: sltu t1, a1, a3
+; RV32I-NEXT: sltu t1, a1, a4
; RV32I-NEXT: .LBB32_2:
; RV32I-NEXT: sub a5, a7, a5
-; RV32I-NEXT: sub a3, a1, a3
-; RV32I-NEXT: sltu a1, a5, t1
+; RV32I-NEXT: sub a1, a1, a4
+; RV32I-NEXT: sltu a4, a5, t1
; RV32I-NEXT: sub a5, a5, t1
-; RV32I-NEXT: sub a1, t0, a1
-; RV32I-NEXT: sub a3, a3, a6
-; RV32I-NEXT: sub a2, a2, a4
-; RV32I-NEXT: bgez a1, .LBB32_4
+; RV32I-NEXT: sub a4, t0, a4
+; RV32I-NEXT: sub a2, a1, a2
+; RV32I-NEXT: sub a1, a6, a3
+; RV32I-NEXT: bgez a4, .LBB32_4
; RV32I-NEXT: # %bb.3:
-; RV32I-NEXT: snez a4, a3
-; RV32I-NEXT: snez a6, a2
+; RV32I-NEXT: snez a3, a2
+; RV32I-NEXT: snez a6, a1
; RV32I-NEXT: neg a7, a5
; RV32I-NEXT: snez a5, a5
+; RV32I-NEXT: or a3, a6, a3
+; RV32I-NEXT: add a4, a4, a5
+; RV32I-NEXT: add a2, a2, a6
+; RV32I-NEXT: sltu a6, a7, a3
+; RV32I-NEXT: neg a4, a4
+; RV32I-NEXT: sub a5, a7, a3
; RV32I-NEXT: neg a2, a2
-; RV32I-NEXT: or a4, a6, a4
-; RV32I-NEXT: add a1, a1, a5
-; RV32I-NEXT: add a3, a3, a6
-; RV32I-NEXT: sltu a6, a7, a4
+; RV32I-NEXT: sub a4, a4, a6
; RV32I-NEXT: neg a1, a1
-; RV32I-NEXT: sub a5, a7, a4
-; RV32I-NEXT: sub a1, a1, a6
-; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: .LBB32_4:
-; RV32I-NEXT: sw a2, 0(a0)
-; RV32I-NEXT: sw a3, 4(a0)
+; RV32I-NEXT: sw a1, 0(a0)
+; RV32I-NEXT: sw a2, 4(a0)
; RV32I-NEXT: sw a5, 8(a0)
-; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: sw a4, 12(a0)
; RV32I-NEXT: ret
;
; RV64I-LABEL: abd_subnsw_i128_undef:
@@ -2205,50 +2205,50 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
;
; RV32ZBB-LABEL: abd_subnsw_i128_undef:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: lw a4, 0(a2)
-; RV32ZBB-NEXT: lw a3, 4(a2)
+; RV32ZBB-NEXT: lw a3, 0(a2)
+; RV32ZBB-NEXT: lw a4, 4(a2)
; RV32ZBB-NEXT: lw a5, 8(a2)
-; RV32ZBB-NEXT: lw a6, 12(a2)
+; RV32ZBB-NEXT: lw a2, 12(a2)
; RV32ZBB-NEXT: lw a7, 8(a1)
; RV32ZBB-NEXT: lw t0, 12(a1)
-; RV32ZBB-NEXT: lw a2, 0(a1)
+; RV32ZBB-NEXT: lw a6, 0(a1)
; RV32ZBB-NEXT: lw a1, 4(a1)
; RV32ZBB-NEXT: sltu t1, a7, a5
-; RV32ZBB-NEXT: sub t0, t0, a6
-; RV32ZBB-NEXT: sltu a6, a2, a4
+; RV32ZBB-NEXT: sub t0, t0, a2
+; RV32ZBB-NEXT: sltu a2, a6, a3
; RV32ZBB-NEXT: sub t0, t0, t1
-; RV32ZBB-NEXT: mv t1, a6
-; RV32ZBB-NEXT: beq a1, a3, .LBB32_2
+; RV32ZBB-NEXT: mv t1, a2
+; RV32ZBB-NEXT: beq a1, a4, .LBB32_2
; RV32ZBB-NEXT: # %bb.1:
-; RV32ZBB-NEXT: sltu t1, a1, a3
+; RV32ZBB-NEXT: sltu t1, a1, a4
; RV32ZBB-NEXT: .LBB32_2:
; RV32ZBB-NEXT: sub a5, a7, a5
-; RV32ZBB-NEXT: sub a3, a1, a3
-; RV32ZBB-NEXT: sltu a1, a5, t1
+; RV32ZBB-NEXT: sub a1, a1, a4
+; RV32ZBB-NEXT: sltu a4, a5, t1
; RV32ZBB-NEXT: sub a5, a5, t1
-; RV32ZBB-NEXT: sub a1, t0, a1
-; RV32ZBB-NEXT: sub a3, a3, a6
-; RV32ZBB-NEXT: sub a2, a2, a4
-; RV32ZBB-NEXT: bgez a1, .LBB32_4
+; RV32ZBB-NEXT: sub a4, t0, a4
+; RV32ZBB-NEXT: sub a2, a1, a2
+; RV32ZBB-NEXT: sub a1, a6, a3
+; RV32ZBB-NEXT: bgez a4, .LBB32_4
; RV32ZBB-NEXT: # %bb.3:
-; RV32ZBB-NEXT: snez a4, a3
-; RV32ZBB-NEXT: snez a6, a2
+; RV32ZBB-NEXT: snez a3, a2
+; RV32ZBB-NEXT: snez a6, a1
; RV32ZBB-NEXT: neg a7, a5
; RV32ZBB-NEXT: snez a5, a5
+; RV32ZBB-NEXT: or a3, a6, a3
+; RV32ZBB-NEXT: add a4, a4, a5
+; RV32ZBB-NEXT: add a2, a2, a6
+; RV32ZBB-NEXT: sltu a6, a7, a3
+; RV32ZBB-NEXT: neg a4, a4
+; RV32ZBB-NEXT: sub a5, a7, a3
; RV32ZBB-NEXT: neg a2, a2
-; RV32ZBB-NEXT: or a4, a6, a4
-; RV32ZBB-NEXT: add a1, a1, a5
-; RV32ZBB-NEXT: add a3, a3, a6
-; RV32ZBB-NEXT: sltu a6, a7, a4
+; RV32ZBB-NEXT: sub a4, a4, a6
; RV32ZBB-NEXT: neg a1, a1
-; RV32ZBB-NEXT: sub a5, a7, a4
-; RV32ZBB-NEXT: sub a1, a1, a6
-; RV32ZBB-NEXT: neg a3, a3
; RV32ZBB-NEXT: .LBB32_4:
-; RV32ZBB-NEXT: sw a2, 0(a0)
-; RV32ZBB-NEXT: sw a3, 4(a0)
+; RV32ZBB-NEXT: sw a1, 0(a0)
+; RV32ZBB-NEXT: sw a2, 4(a0)
; RV32ZBB-NEXT: sw a5, 8(a0)
-; RV32ZBB-NEXT: sw a1, 12(a0)
+; RV32ZBB-NEXT: sw a4, 12(a0)
; RV32ZBB-NEXT: ret
;
; RV64ZBB-LABEL: abd_subnsw_i128_undef:
diff --git a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
new file mode 100644
index 0000000..be3de37
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 | FileCheck %s
+
+define i1 @src(i64 %x) {
+; CHECK-LABEL: src:
+; CHECK: # %bb.0:
+; CHECK-NEXT: srai a0, a0, 30
+; CHECK-NEXT: addi a0, a0, 2
+; CHECK-NEXT: seqz a0, a0
+; CHECK-NEXT: ret
+ %a = and i64 %x, -1073741824
+ %b = icmp eq i64 %a, -2147483648
+ ret i1 %b
+}
diff --git a/llvm/test/CodeGen/RISCV/combine-storetomstore.ll b/llvm/test/CodeGen/RISCV/combine-storetomstore.ll
new file mode 100644
index 0000000..c7d1f76
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/combine-storetomstore.ll
@@ -0,0 +1,684 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64-- -mattr=+m,+v,+f | FileCheck %s -check-prefix=RISCV
+
+define void @test_masked_store_success_v4i8(<4 x i8> %x, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v4i8:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RISCV-NEXT: vse8.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <4 x i8>, ptr %ptr, align 32
+ %sel = select <4 x i1> %mask, <4 x i8> %x, <4 x i8> %load
+ store <4 x i8> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v4i16(<4 x i16> %x, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v4i16:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RISCV-NEXT: vse16.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <4 x i16>, ptr %ptr, align 32
+ %sel = select <4 x i1> %mask, <4 x i16> %x, <4 x i16> %load
+ store <4 x i16> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v4i32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RISCV-NEXT: vse32.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <4 x i32>, ptr %ptr, align 32
+ %sel = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %load
+ store <4 x i32> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v4i64(<4 x i64> %x, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v4i64:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RISCV-NEXT: vse64.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <4 x i64>, ptr %ptr, align 32
+ %sel = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %load
+ store <4 x i64> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v4f16(<4 x half> %x, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v4f16:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RISCV-NEXT: vmv1r.v v9, v0
+; RISCV-NEXT: vfirst.m a3, v0
+; RISCV-NEXT: mv a2, a0
+; RISCV-NEXT: beqz a3, .LBB4_2
+; RISCV-NEXT: # %bb.1:
+; RISCV-NEXT: mv a2, a1
+; RISCV-NEXT: .LBB4_2:
+; RISCV-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RISCV-NEXT: vmv.v.i v8, 0
+; RISCV-NEXT: vmv1r.v v0, v9
+; RISCV-NEXT: vmerge.vim v8, v8, 1, v0
+; RISCV-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RISCV-NEXT: vslidedown.vi v8, v8, 2
+; RISCV-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RISCV-NEXT: vmsne.vi v8, v8, 0
+; RISCV-NEXT: vmv.v.i v10, 0
+; RISCV-NEXT: vmv1r.v v0, v8
+; RISCV-NEXT: vmerge.vim v11, v10, 1, v0
+; RISCV-NEXT: vslidedown.vi v11, v11, 1
+; RISCV-NEXT: vmv.x.s a3, v11
+; RISCV-NEXT: andi a3, a3, 1
+; RISCV-NEXT: bnez a3, .LBB4_4
+; RISCV-NEXT: # %bb.3:
+; RISCV-NEXT: addi a3, a1, 6
+; RISCV-NEXT: j .LBB4_5
+; RISCV-NEXT: .LBB4_4:
+; RISCV-NEXT: addi a3, a0, 24
+; RISCV-NEXT: .LBB4_5:
+; RISCV-NEXT: vmv1r.v v0, v9
+; RISCV-NEXT: vmerge.vim v9, v10, 1, v0
+; RISCV-NEXT: vslidedown.vi v9, v9, 1
+; RISCV-NEXT: vmv.x.s a4, v9
+; RISCV-NEXT: andi a4, a4, 1
+; RISCV-NEXT: bnez a4, .LBB4_7
+; RISCV-NEXT: # %bb.6:
+; RISCV-NEXT: addi a5, a1, 2
+; RISCV-NEXT: j .LBB4_8
+; RISCV-NEXT: .LBB4_7:
+; RISCV-NEXT: addi a5, a0, 8
+; RISCV-NEXT: .LBB4_8:
+; RISCV-NEXT: lh a4, 0(a2)
+; RISCV-NEXT: lh a2, 0(a3)
+; RISCV-NEXT: lh a3, 0(a5)
+; RISCV-NEXT: vfirst.m a5, v8
+; RISCV-NEXT: beqz a5, .LBB4_10
+; RISCV-NEXT: # %bb.9:
+; RISCV-NEXT: addi a0, a1, 4
+; RISCV-NEXT: j .LBB4_11
+; RISCV-NEXT: .LBB4_10:
+; RISCV-NEXT: addi a0, a0, 16
+; RISCV-NEXT: .LBB4_11:
+; RISCV-NEXT: lh a0, 0(a0)
+; RISCV-NEXT: sh a4, 0(a1)
+; RISCV-NEXT: sh a3, 2(a1)
+; RISCV-NEXT: sh a0, 4(a1)
+; RISCV-NEXT: sh a2, 6(a1)
+; RISCV-NEXT: ret
+ %load = load <4 x half>, ptr %ptr, align 32
+ %sel = select <4 x i1> %mask, <4 x half> %x, <4 x half> %load
+ store <4 x half> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v4f32(<4 x float> %x, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v4f32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RISCV-NEXT: vse32.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <4 x float>, ptr %ptr, align 32
+ %sel = select <4 x i1> %mask, <4 x float> %x, <4 x float> %load
+ store <4 x float> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v4f64(<4 x double> %x, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v4f64:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RISCV-NEXT: vse64.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <4 x double>, ptr %ptr, align 32
+ %sel = select <4 x i1> %mask, <4 x double> %x, <4 x double> %load
+ store <4 x double> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v8i8(<8 x i8> %x, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v8i8:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RISCV-NEXT: vse8.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <8 x i8>, ptr %ptr, align 32
+ %sel = select <8 x i1> %mask, <8 x i8> %x, <8 x i8> %load
+ store <8 x i8> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v8i16(<8 x i16> %x, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v8i16:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RISCV-NEXT: vse16.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <8 x i16>, ptr %ptr, align 32
+ %sel = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %load
+ store <8 x i16> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v8i32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RISCV-NEXT: vse32.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <8 x i32>, ptr %ptr, align 32
+ %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
+ store <8 x i32> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v8i64(<8 x i64> %x, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v8i64:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RISCV-NEXT: vse64.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <8 x i64>, ptr %ptr, align 32
+ %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load
+ store <8 x i64> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v8f16(<8 x half> %x, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v8f16:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RISCV-NEXT: vmv1r.v v8, v0
+; RISCV-NEXT: vfirst.m a3, v0
+; RISCV-NEXT: mv a2, a0
+; RISCV-NEXT: beqz a3, .LBB11_2
+; RISCV-NEXT: # %bb.1:
+; RISCV-NEXT: mv a2, a1
+; RISCV-NEXT: .LBB11_2:
+; RISCV-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RISCV-NEXT: vmv.v.i v9, 0
+; RISCV-NEXT: vmv1r.v v0, v8
+; RISCV-NEXT: vmerge.vim v9, v9, 1, v0
+; RISCV-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; RISCV-NEXT: vslidedown.vi v9, v9, 4
+; RISCV-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RISCV-NEXT: vmsne.vi v11, v9, 0
+; RISCV-NEXT: vmv.v.i v10, 0
+; RISCV-NEXT: vmv1r.v v0, v11
+; RISCV-NEXT: vmerge.vim v9, v10, 1, v0
+; RISCV-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RISCV-NEXT: vslidedown.vi v9, v9, 2
+; RISCV-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RISCV-NEXT: vmsne.vi v9, v9, 0
+; RISCV-NEXT: vmv.v.i v12, 0
+; RISCV-NEXT: vmv1r.v v0, v9
+; RISCV-NEXT: vmerge.vim v13, v12, 1, v0
+; RISCV-NEXT: vslidedown.vi v13, v13, 1
+; RISCV-NEXT: vmv.x.s a3, v13
+; RISCV-NEXT: andi a3, a3, 1
+; RISCV-NEXT: bnez a3, .LBB11_4
+; RISCV-NEXT: # %bb.3:
+; RISCV-NEXT: addi a3, a1, 14
+; RISCV-NEXT: j .LBB11_5
+; RISCV-NEXT: .LBB11_4:
+; RISCV-NEXT: addi a3, a0, 56
+; RISCV-NEXT: .LBB11_5:
+; RISCV-NEXT: vmv1r.v v0, v8
+; RISCV-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RISCV-NEXT: vmerge.vim v10, v10, 1, v0
+; RISCV-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RISCV-NEXT: vslidedown.vi v10, v10, 2
+; RISCV-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RISCV-NEXT: vmsne.vi v10, v10, 0
+; RISCV-NEXT: vmv1r.v v0, v10
+; RISCV-NEXT: vmerge.vim v13, v12, 1, v0
+; RISCV-NEXT: vslidedown.vi v13, v13, 1
+; RISCV-NEXT: vmv.x.s a4, v13
+; RISCV-NEXT: andi a4, a4, 1
+; RISCV-NEXT: bnez a4, .LBB11_8
+; RISCV-NEXT: # %bb.6:
+; RISCV-NEXT: addi a4, a1, 6
+; RISCV-NEXT: vfirst.m a5, v11
+; RISCV-NEXT: bnez a5, .LBB11_9
+; RISCV-NEXT: .LBB11_7:
+; RISCV-NEXT: addi a5, a0, 32
+; RISCV-NEXT: j .LBB11_10
+; RISCV-NEXT: .LBB11_8:
+; RISCV-NEXT: addi a4, a0, 24
+; RISCV-NEXT: vfirst.m a5, v11
+; RISCV-NEXT: beqz a5, .LBB11_7
+; RISCV-NEXT: .LBB11_9:
+; RISCV-NEXT: addi a5, a1, 8
+; RISCV-NEXT: .LBB11_10:
+; RISCV-NEXT: vmv1r.v v0, v11
+; RISCV-NEXT: vmerge.vim v11, v12, 1, v0
+; RISCV-NEXT: vslidedown.vi v11, v11, 1
+; RISCV-NEXT: vmv.x.s a6, v11
+; RISCV-NEXT: andi a6, a6, 1
+; RISCV-NEXT: bnez a6, .LBB11_14
+; RISCV-NEXT: # %bb.11:
+; RISCV-NEXT: addi a6, a1, 10
+; RISCV-NEXT: vfirst.m a7, v9
+; RISCV-NEXT: bnez a7, .LBB11_15
+; RISCV-NEXT: .LBB11_12:
+; RISCV-NEXT: addi a7, a0, 48
+; RISCV-NEXT: vfirst.m t0, v10
+; RISCV-NEXT: bnez t0, .LBB11_16
+; RISCV-NEXT: .LBB11_13:
+; RISCV-NEXT: addi t1, a0, 16
+; RISCV-NEXT: j .LBB11_17
+; RISCV-NEXT: .LBB11_14:
+; RISCV-NEXT: addi a6, a0, 40
+; RISCV-NEXT: vfirst.m a7, v9
+; RISCV-NEXT: beqz a7, .LBB11_12
+; RISCV-NEXT: .LBB11_15:
+; RISCV-NEXT: addi a7, a1, 12
+; RISCV-NEXT: vfirst.m t0, v10
+; RISCV-NEXT: beqz t0, .LBB11_13
+; RISCV-NEXT: .LBB11_16:
+; RISCV-NEXT: addi t1, a1, 4
+; RISCV-NEXT: .LBB11_17:
+; RISCV-NEXT: vmv1r.v v0, v8
+; RISCV-NEXT: lh t0, 0(a2)
+; RISCV-NEXT: lh a2, 0(a3)
+; RISCV-NEXT: lh a3, 0(a4)
+; RISCV-NEXT: lh a4, 0(a5)
+; RISCV-NEXT: lh a5, 0(a6)
+; RISCV-NEXT: lh a6, 0(a7)
+; RISCV-NEXT: lh a7, 0(t1)
+; RISCV-NEXT: vmerge.vim v8, v12, 1, v0
+; RISCV-NEXT: vslidedown.vi v8, v8, 1
+; RISCV-NEXT: vmv.x.s t1, v8
+; RISCV-NEXT: andi t1, t1, 1
+; RISCV-NEXT: bnez t1, .LBB11_19
+; RISCV-NEXT: # %bb.18:
+; RISCV-NEXT: addi a0, a1, 2
+; RISCV-NEXT: j .LBB11_20
+; RISCV-NEXT: .LBB11_19:
+; RISCV-NEXT: addi a0, a0, 8
+; RISCV-NEXT: .LBB11_20:
+; RISCV-NEXT: lh a0, 0(a0)
+; RISCV-NEXT: sh t0, 0(a1)
+; RISCV-NEXT: sh a0, 2(a1)
+; RISCV-NEXT: sh a7, 4(a1)
+; RISCV-NEXT: sh a3, 6(a1)
+; RISCV-NEXT: sh a4, 8(a1)
+; RISCV-NEXT: sh a5, 10(a1)
+; RISCV-NEXT: sh a6, 12(a1)
+; RISCV-NEXT: sh a2, 14(a1)
+; RISCV-NEXT: ret
+ %load = load <8 x half>, ptr %ptr, align 32
+ %sel = select <8 x i1> %mask, <8 x half> %x, <8 x half> %load
+ store <8 x half> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v8f32(<8 x float> %x, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v8f32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RISCV-NEXT: vse32.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <8 x float>, ptr %ptr, align 32
+ %sel = select <8 x i1> %mask, <8 x float> %x, <8 x float> %load
+ store <8 x float> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v8f64(<8 x double> %x, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v8f64:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RISCV-NEXT: vse64.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <8 x double>, ptr %ptr, align 32
+ %sel = select <8 x i1> %mask, <8 x double> %x, <8 x double> %load
+ store <8 x double> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v16i8(<16 x i8> %x, ptr %ptr, <16 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v16i8:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RISCV-NEXT: vse8.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <16 x i8>, ptr %ptr, align 32
+ %sel = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %load
+ store <16 x i8> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v16i16(<16 x i16> %x, ptr %ptr, <16 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v16i16:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; RISCV-NEXT: vse16.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <16 x i16>, ptr %ptr, align 32
+ %sel = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %load
+ store <16 x i16> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v16i32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RISCV-NEXT: vse32.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <16 x i32>, ptr %ptr, align 32
+ %sel = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %load
+ store <16 x i32> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v32i8(<32 x i8> %x, ptr %ptr, <32 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v32i8:
+; RISCV: # %bb.0:
+; RISCV-NEXT: li a1, 32
+; RISCV-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RISCV-NEXT: vse8.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <32 x i8>, ptr %ptr, align 32
+ %sel = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %load
+ store <32 x i8> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v32i16(<32 x i16> %x, ptr %ptr, <32 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v32i16:
+; RISCV: # %bb.0:
+; RISCV-NEXT: li a1, 32
+; RISCV-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; RISCV-NEXT: vse16.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <32 x i16>, ptr %ptr, align 32
+ %sel = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %load
+ store <32 x i16> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_v64i8(<64 x i8> %x, ptr %ptr, <64 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_v64i8:
+; RISCV: # %bb.0:
+; RISCV-NEXT: li a1, 64
+; RISCV-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RISCV-NEXT: vse8.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <64 x i8>, ptr %ptr, align 32
+ %sel = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %load
+ store <64 x i8> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_invert_mask_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_invert_mask_v4i32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RISCV-NEXT: vmnot.m v0, v0
+; RISCV-NEXT: vse32.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <4 x i32>, ptr %ptr, align 32
+ %sel = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> %x
+ store <4 x i32> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_invert_mask_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_invert_mask_v8i32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RISCV-NEXT: vmnot.m v0, v0
+; RISCV-NEXT: vse32.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <8 x i32>, ptr %ptr, align 32
+ %sel = select <8 x i1> %mask, <8 x i32> %load, <8 x i32> %x
+ store <8 x i32> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_success_invert_mask_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_success_invert_mask_v16i32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RISCV-NEXT: vmnot.m v0, v0
+; RISCV-NEXT: vse32.v v8, (a0), v0.t
+; RISCV-NEXT: ret
+ %load = load <16 x i32>, ptr %ptr, align 32
+ %sel = select <16 x i1> %mask, <16 x i32> %load, <16 x i32> %x
+ store <16 x i32> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_zextload(<4 x i64> %x, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_zextload:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RISCV-NEXT: vle32.v v12, (a0)
+; RISCV-NEXT: vzext.vf2 v10, v12
+; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0
+; RISCV-NEXT: vse64.v v8, (a0)
+; RISCV-NEXT: ret
+ %load = load <4 x i32>, ptr %ptr, align 32
+ %zext = zext <4 x i32> %load to <4 x i64>
+ %masked = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %zext
+ store <4 x i64> %masked, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_volatile_load:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RISCV-NEXT: vle32.v v10, (a0)
+; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0
+; RISCV-NEXT: vse32.v v8, (a0)
+; RISCV-NEXT: ret
+ %load = load volatile <8 x i32>, ptr %ptr, align 32
+ %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
+ store <8 x i32> %sel, ptr %ptr, align 32
+ ret void
+}
+
+define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_volatile_store:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RISCV-NEXT: vle32.v v10, (a0)
+; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0
+; RISCV-NEXT: vse32.v v8, (a0)
+; RISCV-NEXT: ret
+ %load = load <8 x i32>, ptr %ptr, align 32
+ %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
+ store volatile <8 x i32> %sel, ptr %ptr, align 32
+ ret void
+}
+
+declare void @use_vec(<8 x i32>)
+
+define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) nounwind {
+; RISCV-LABEL: test_masked_store_intervening:
+; RISCV: # %bb.0:
+; RISCV-NEXT: addi sp, sp, -32
+; RISCV-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RISCV-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RISCV-NEXT: csrr a1, vlenb
+; RISCV-NEXT: slli a2, a1, 2
+; RISCV-NEXT: add a1, a2, a1
+; RISCV-NEXT: sub sp, sp, a1
+; RISCV-NEXT: csrr a1, vlenb
+; RISCV-NEXT: slli a1, a1, 2
+; RISCV-NEXT: add a1, sp, a1
+; RISCV-NEXT: addi a1, a1, 16
+; RISCV-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill
+; RISCV-NEXT: mv s0, a0
+; RISCV-NEXT: csrr a1, vlenb
+; RISCV-NEXT: slli a1, a1, 1
+; RISCV-NEXT: add a1, sp, a1
+; RISCV-NEXT: addi a1, a1, 16
+; RISCV-NEXT: vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
+; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RISCV-NEXT: vle32.v v8, (a0)
+; RISCV-NEXT: addi a1, sp, 16
+; RISCV-NEXT: vs2r.v v8, (a1) # vscale x 16-byte Folded Spill
+; RISCV-NEXT: vmv.v.i v8, 0
+; RISCV-NEXT: vse32.v v8, (a0)
+; RISCV-NEXT: call use_vec
+; RISCV-NEXT: csrr a0, vlenb
+; RISCV-NEXT: slli a0, a0, 2
+; RISCV-NEXT: add a0, sp, a0
+; RISCV-NEXT: addi a0, a0, 16
+; RISCV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
+; RISCV-NEXT: csrr a0, vlenb
+; RISCV-NEXT: slli a0, a0, 1
+; RISCV-NEXT: add a0, sp, a0
+; RISCV-NEXT: addi a0, a0, 16
+; RISCV-NEXT: vl2r.v v8, (a0) # vscale x 16-byte Folded Reload
+; RISCV-NEXT: addi a0, sp, 16
+; RISCV-NEXT: vl2r.v v10, (a0) # vscale x 16-byte Folded Reload
+; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0
+; RISCV-NEXT: vse32.v v8, (s0)
+; RISCV-NEXT: csrr a0, vlenb
+; RISCV-NEXT: slli a1, a0, 2
+; RISCV-NEXT: add a0, a1, a0
+; RISCV-NEXT: add sp, sp, a0
+; RISCV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RISCV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RISCV-NEXT: addi sp, sp, 32
+; RISCV-NEXT: ret
+ %load = load <8 x i32>, ptr %ptr, align 32
+ store <8 x i32> zeroinitializer, ptr %ptr, align 32
+ %tmp = load <8 x i32>, ptr %ptr
+ call void @use_vec(<8 x i32> %tmp)
+ %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
+ store <8 x i32> %sel, ptr %ptr, align 32
+ ret void
+}
+
+
+define void @test_masked_store_multiple_v8i32(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) {
+; RISCV-LABEL: test_masked_store_multiple_v8i32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RISCV-NEXT: vmv1r.v v13, v0
+; RISCV-NEXT: vle32.v v14, (a1)
+; RISCV-NEXT: vmv1r.v v0, v12
+; RISCV-NEXT: vmerge.vvm v10, v14, v10, v0
+; RISCV-NEXT: vmv1r.v v0, v13
+; RISCV-NEXT: vse32.v v8, (a0), v0.t
+; RISCV-NEXT: vse32.v v10, (a1)
+; RISCV-NEXT: ret
+ %load = load <8 x i32>, ptr %ptr1, align 32
+ %load2 = load <8 x i32>, ptr %ptr2, align 32
+ %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
+ %sel2 = select <8 x i1> %mask2, <8 x i32> %y, <8 x i32> %load2
+ store <8 x i32> %sel, ptr %ptr1, align 32
+ store <8 x i32> %sel2, ptr %ptr2, align 32
+ ret void
+}
+
+define void @test_masked_store_multiple_v8i64(<8 x i64> %x, <8 x i64> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) {
+; RISCV-LABEL: test_masked_store_multiple_v8i64:
+; RISCV: # %bb.0:
+; RISCV-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RISCV-NEXT: vmv1r.v v17, v0
+; RISCV-NEXT: vle64.v v20, (a1)
+; RISCV-NEXT: vmv1r.v v0, v16
+; RISCV-NEXT: vmerge.vvm v12, v20, v12, v0
+; RISCV-NEXT: vmv1r.v v0, v17
+; RISCV-NEXT: vse64.v v8, (a0), v0.t
+; RISCV-NEXT: vse64.v v12, (a1)
+; RISCV-NEXT: ret
+ %load = load <8 x i64>, ptr %ptr1, align 32
+ %load2 = load <8 x i64>, ptr %ptr2, align 32
+ %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load
+ %sel2 = select <8 x i1> %mask2, <8 x i64> %y, <8 x i64> %load2
+ store <8 x i64> %sel, ptr %ptr1, align 32
+ store <8 x i64> %sel2, ptr %ptr2, align 32
+ ret void
+}
+
+define void @test_masked_store_unaligned_v4i32(<4 x i32> %data, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_unaligned_v4i32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: addi a0, a0, 1
+; RISCV-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RISCV-NEXT: vle8.v v9, (a0)
+; RISCV-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RISCV-NEXT: vmerge.vvm v8, v9, v8, v0
+; RISCV-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RISCV-NEXT: vse8.v v8, (a0)
+; RISCV-NEXT: ret
+ %ptr_i8 = getelementptr i8, ptr %ptr, i32 1
+ %ptr_vec = bitcast ptr %ptr_i8 to ptr
+ %load = load <4 x i32>, ptr %ptr_vec, align 1
+ %sel = select <4 x i1> %mask, <4 x i32> %data, <4 x i32> %load
+ store <4 x i32> %sel, ptr %ptr_vec, align 1
+ ret void
+}
+
+define void @test_masked_store_unaligned_v4i64(<4 x i64> %data, ptr %ptr, <4 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_unaligned_v4i64:
+; RISCV: # %bb.0:
+; RISCV-NEXT: addi a0, a0, 1
+; RISCV-NEXT: li a1, 32
+; RISCV-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RISCV-NEXT: vle8.v v10, (a0)
+; RISCV-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0
+; RISCV-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RISCV-NEXT: vse8.v v8, (a0)
+; RISCV-NEXT: ret
+ %ptr_i8 = getelementptr i8, ptr %ptr, i64 1
+ %ptr_vec = bitcast ptr %ptr_i8 to ptr
+ %load = load <4 x i64>, ptr %ptr_vec, align 1
+ %sel = select <4 x i1> %mask, <4 x i64> %data, <4 x i64> %load
+ store <4 x i64> %sel, ptr %ptr_vec, align 1
+ ret void
+}
+
+define void @test_masked_store_unaligned_v8i32(<8 x i32> %data, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_unaligned_v8i32:
+; RISCV: # %bb.0:
+; RISCV-NEXT: addi a0, a0, 1
+; RISCV-NEXT: li a1, 32
+; RISCV-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RISCV-NEXT: vle8.v v10, (a0)
+; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0
+; RISCV-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RISCV-NEXT: vse8.v v8, (a0)
+; RISCV-NEXT: ret
+ %ptr_i8 = getelementptr i8, ptr %ptr, i32 1
+ %ptr_vec = bitcast ptr %ptr_i8 to ptr
+ %load = load <8 x i32>, ptr %ptr_vec, align 1
+ %sel = select <8 x i1> %mask, <8 x i32> %data, <8 x i32> %load
+ store <8 x i32> %sel, ptr %ptr_vec, align 1
+ ret void
+}
+
+define void @test_masked_store_unaligned_v8i64(<8 x i64> %data, ptr %ptr, <8 x i1> %mask) {
+; RISCV-LABEL: test_masked_store_unaligned_v8i64:
+; RISCV: # %bb.0:
+; RISCV-NEXT: addi a0, a0, 1
+; RISCV-NEXT: li a1, 64
+; RISCV-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RISCV-NEXT: vle8.v v12, (a0)
+; RISCV-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RISCV-NEXT: vmerge.vvm v8, v12, v8, v0
+; RISCV-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RISCV-NEXT: vse8.v v8, (a0)
+; RISCV-NEXT: ret
+ %ptr_i8 = getelementptr i8, ptr %ptr, i64 1
+ %ptr_vec = bitcast ptr %ptr_i8 to ptr
+ %load = load <8 x i64>, ptr %ptr_vec, align 1
+ %sel = select <8 x i1> %mask, <8 x i64> %data, <8 x i64> %load
+ store <8 x i64> %sel, ptr %ptr_vec, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index b94665b..fb53921 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -6,13 +6,21 @@
; CHECK-NEXT: 32bit - Implements RV32.
; CHECK-NEXT: 64bit - Implements RV64.
; CHECK-NEXT: a - 'A' (Atomic Instructions).
+; CHECK-NEXT: add-load-fusion - Enable ADD(.UW) + load macrofusion.
+; CHECK-NEXT: addi-load-fusion - Enable ADDI + load macrofusion.
; CHECK-NEXT: andes45 - Andes 45-Series processors.
; CHECK-NEXT: auipc-addi-fusion - Enable AUIPC+ADDI macrofusion.
+; CHECK-NEXT: auipc-load-fusion - Enable AUIPC + load macrofusion.
; CHECK-NEXT: b - 'B' (the collection of the Zba, Zbb, Zbs extensions).
+; CHECK-NEXT: bfext-fusion - Enable SLLI+SRLI (bitfield extract) macrofusion.
; CHECK-NEXT: c - 'C' (Compressed Instructions).
; CHECK-NEXT: conditional-cmv-fusion - Enable branch+c.mv fusion.
; CHECK-NEXT: d - 'D' (Double-Precision Floating-Point).
; CHECK-NEXT: disable-latency-sched-heuristic - Disable latency scheduling heuristic.
+; CHECK-NEXT: disable-misched-load-clustering - Disable load clustering in the machine scheduler.
+; CHECK-NEXT: disable-misched-store-clustering - Disable store clustering in the machine scheduler.
+; CHECK-NEXT: disable-postmisched-load-clustering - Disable PostRA load clustering in the machine scheduler.
+; CHECK-NEXT: disable-postmisched-store-clustering - Disable PostRA store clustering in the machine scheduler.
; CHECK-NEXT: dlen-factor-2 - Vector unit DLEN(data path width) is half of VLEN.
; CHECK-NEXT: e - 'E' (Embedded Instruction Set with 16 GPRs).
; CHECK-NEXT: exact-asm - Enable Exact Assembly (Disables Compression and Relaxation).
@@ -58,6 +66,7 @@
; CHECK-NEXT: ld-add-fusion - Enable LD+ADD macrofusion.
; CHECK-NEXT: log-vrgather - Has vrgather.vv with LMUL*log2(LMUL) latency
; CHECK-NEXT: lui-addi-fusion - Enable LUI+ADDI macro fusion.
+; CHECK-NEXT: lui-load-fusion - Enable LUI + load macrofusion.
; CHECK-NEXT: m - 'M' (Integer Multiplication and Division).
; CHECK-NEXT: mips-p8700 - MIPS p8700 processor.
; CHECK-NEXT: no-default-unroll - Disable default unroll preference..
@@ -130,6 +139,7 @@
; CHECK-NEXT: shvsatpa - 'Shvsatpa' (vsatp supports all modes supported by satp).
; CHECK-NEXT: shvstvala - 'Shvstvala' (vstval provides all needed values).
; CHECK-NEXT: shvstvecd - 'Shvstvecd' (vstvec supports Direct mode).
+; CHECK-NEXT: shxadd-load-fusion - Enable SH(1|2|3)ADD(.UW) + load macrofusion.
; CHECK-NEXT: sifive7 - SiFive 7-Series processors.
; CHECK-NEXT: smaia - 'Smaia' (Advanced Interrupt Architecture Machine Level).
; CHECK-NEXT: smcdeleg - 'Smcdeleg' (Counter Delegation Machine Level).
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 117e3e4..519f1e8 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -1110,15 +1110,15 @@ define i64 @stest_f64i64(double %x) {
; RV32IF-NEXT: .LBB18_3: # %entry
; RV32IF-NEXT: slti a6, a4, 0
; RV32IF-NEXT: .LBB18_4: # %entry
-; RV32IF-NEXT: addi a7, a6, -1
-; RV32IF-NEXT: neg t0, a6
+; RV32IF-NEXT: neg a7, a6
+; RV32IF-NEXT: addi t0, a6, -1
; RV32IF-NEXT: bnez a6, .LBB18_6
; RV32IF-NEXT: # %bb.5: # %entry
; RV32IF-NEXT: mv a1, a5
; RV32IF-NEXT: .LBB18_6: # %entry
-; RV32IF-NEXT: or a3, a7, a3
-; RV32IF-NEXT: and a4, t0, a4
-; RV32IF-NEXT: and a2, t0, a2
+; RV32IF-NEXT: or a3, t0, a3
+; RV32IF-NEXT: and a4, a7, a4
+; RV32IF-NEXT: and a2, a7, a2
; RV32IF-NEXT: beq a1, a0, .LBB18_8
; RV32IF-NEXT: # %bb.7: # %entry
; RV32IF-NEXT: sltu a0, a0, a1
@@ -1213,15 +1213,15 @@ define i64 @stest_f64i64(double %x) {
; RV32IFD-NEXT: .LBB18_3: # %entry
; RV32IFD-NEXT: slti a6, a4, 0
; RV32IFD-NEXT: .LBB18_4: # %entry
-; RV32IFD-NEXT: addi a7, a6, -1
-; RV32IFD-NEXT: neg t0, a6
+; RV32IFD-NEXT: neg a7, a6
+; RV32IFD-NEXT: addi t0, a6, -1
; RV32IFD-NEXT: bnez a6, .LBB18_6
; RV32IFD-NEXT: # %bb.5: # %entry
; RV32IFD-NEXT: mv a1, a5
; RV32IFD-NEXT: .LBB18_6: # %entry
-; RV32IFD-NEXT: or a3, a7, a3
-; RV32IFD-NEXT: and a4, t0, a4
-; RV32IFD-NEXT: and a2, t0, a2
+; RV32IFD-NEXT: or a3, t0, a3
+; RV32IFD-NEXT: and a4, a7, a4
+; RV32IFD-NEXT: and a2, a7, a2
; RV32IFD-NEXT: beq a1, a0, .LBB18_8
; RV32IFD-NEXT: # %bb.7: # %entry
; RV32IFD-NEXT: sltu a0, a0, a1
@@ -1378,8 +1378,8 @@ define i64 @ustest_f64i64(double %x) {
; RV32IF-NEXT: # %bb.4: # %entry
; RV32IF-NEXT: li a0, 1
; RV32IF-NEXT: .LBB20_5: # %entry
-; RV32IF-NEXT: lw a3, 8(sp)
-; RV32IF-NEXT: lw a4, 12(sp)
+; RV32IF-NEXT: lw a4, 8(sp)
+; RV32IF-NEXT: lw a3, 12(sp)
; RV32IF-NEXT: and a5, a2, a1
; RV32IF-NEXT: beqz a5, .LBB20_7
; RV32IF-NEXT: # %bb.6: # %entry
@@ -1393,12 +1393,12 @@ define i64 @ustest_f64i64(double %x) {
; RV32IF-NEXT: and a2, a2, a3
; RV32IF-NEXT: bnez a0, .LBB20_10
; RV32IF-NEXT: # %bb.9:
-; RV32IF-NEXT: or a0, a2, a4
+; RV32IF-NEXT: or a0, a4, a2
; RV32IF-NEXT: snez a1, a0
; RV32IF-NEXT: .LBB20_10: # %entry
; RV32IF-NEXT: neg a1, a1
-; RV32IF-NEXT: and a0, a1, a2
-; RV32IF-NEXT: and a1, a1, a4
+; RV32IF-NEXT: and a0, a1, a4
+; RV32IF-NEXT: and a1, a1, a2
; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IF-NEXT: .cfi_restore ra
; RV32IF-NEXT: addi sp, sp, 32
@@ -1461,8 +1461,8 @@ define i64 @ustest_f64i64(double %x) {
; RV32IFD-NEXT: # %bb.4: # %entry
; RV32IFD-NEXT: li a0, 1
; RV32IFD-NEXT: .LBB20_5: # %entry
-; RV32IFD-NEXT: lw a3, 8(sp)
-; RV32IFD-NEXT: lw a4, 12(sp)
+; RV32IFD-NEXT: lw a4, 8(sp)
+; RV32IFD-NEXT: lw a3, 12(sp)
; RV32IFD-NEXT: and a5, a2, a1
; RV32IFD-NEXT: beqz a5, .LBB20_7
; RV32IFD-NEXT: # %bb.6: # %entry
@@ -1476,12 +1476,12 @@ define i64 @ustest_f64i64(double %x) {
; RV32IFD-NEXT: and a2, a2, a3
; RV32IFD-NEXT: bnez a0, .LBB20_10
; RV32IFD-NEXT: # %bb.9:
-; RV32IFD-NEXT: or a0, a2, a4
+; RV32IFD-NEXT: or a0, a4, a2
; RV32IFD-NEXT: snez a1, a0
; RV32IFD-NEXT: .LBB20_10: # %entry
; RV32IFD-NEXT: neg a1, a1
-; RV32IFD-NEXT: and a0, a1, a2
-; RV32IFD-NEXT: and a1, a1, a4
+; RV32IFD-NEXT: and a0, a1, a4
+; RV32IFD-NEXT: and a1, a1, a2
; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: .cfi_restore ra
; RV32IFD-NEXT: addi sp, sp, 32
@@ -1525,15 +1525,15 @@ define i64 @stest_f32i64(float %x) {
; RV32-NEXT: .LBB21_3: # %entry
; RV32-NEXT: slti a6, a4, 0
; RV32-NEXT: .LBB21_4: # %entry
-; RV32-NEXT: addi a7, a6, -1
-; RV32-NEXT: neg t0, a6
+; RV32-NEXT: neg a7, a6
+; RV32-NEXT: addi t0, a6, -1
; RV32-NEXT: bnez a6, .LBB21_6
; RV32-NEXT: # %bb.5: # %entry
; RV32-NEXT: mv a1, a5
; RV32-NEXT: .LBB21_6: # %entry
-; RV32-NEXT: or a3, a7, a3
-; RV32-NEXT: and a4, t0, a4
-; RV32-NEXT: and a2, t0, a2
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: and a4, a7, a4
+; RV32-NEXT: and a2, a7, a2
; RV32-NEXT: beq a1, a0, .LBB21_8
; RV32-NEXT: # %bb.7: # %entry
; RV32-NEXT: sltu a0, a0, a1
@@ -1658,8 +1658,8 @@ define i64 @ustest_f32i64(float %x) {
; RV32-NEXT: # %bb.4: # %entry
; RV32-NEXT: li a0, 1
; RV32-NEXT: .LBB23_5: # %entry
-; RV32-NEXT: lw a3, 8(sp)
-; RV32-NEXT: lw a4, 12(sp)
+; RV32-NEXT: lw a4, 8(sp)
+; RV32-NEXT: lw a3, 12(sp)
; RV32-NEXT: and a5, a2, a1
; RV32-NEXT: beqz a5, .LBB23_7
; RV32-NEXT: # %bb.6: # %entry
@@ -1673,12 +1673,12 @@ define i64 @ustest_f32i64(float %x) {
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: bnez a0, .LBB23_10
; RV32-NEXT: # %bb.9:
-; RV32-NEXT: or a0, a2, a4
+; RV32-NEXT: or a0, a4, a2
; RV32-NEXT: snez a1, a0
; RV32-NEXT: .LBB23_10: # %entry
; RV32-NEXT: neg a1, a1
-; RV32-NEXT: and a0, a1, a2
-; RV32-NEXT: and a1, a1, a4
+; RV32-NEXT: and a0, a1, a4
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
@@ -1752,15 +1752,15 @@ define i64 @stest_f16i64(half %x) {
; RV32-NEXT: .LBB24_3: # %entry
; RV32-NEXT: slti a6, a4, 0
; RV32-NEXT: .LBB24_4: # %entry
-; RV32-NEXT: addi a7, a6, -1
-; RV32-NEXT: neg t0, a6
+; RV32-NEXT: neg a7, a6
+; RV32-NEXT: addi t0, a6, -1
; RV32-NEXT: bnez a6, .LBB24_6
; RV32-NEXT: # %bb.5: # %entry
; RV32-NEXT: mv a1, a5
; RV32-NEXT: .LBB24_6: # %entry
-; RV32-NEXT: or a3, a7, a3
-; RV32-NEXT: and a4, t0, a4
-; RV32-NEXT: and a2, t0, a2
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: and a4, a7, a4
+; RV32-NEXT: and a2, a7, a2
; RV32-NEXT: beq a1, a0, .LBB24_8
; RV32-NEXT: # %bb.7: # %entry
; RV32-NEXT: sltu a0, a0, a1
@@ -1921,8 +1921,8 @@ define i64 @ustest_f16i64(half %x) {
; RV32-NEXT: # %bb.4: # %entry
; RV32-NEXT: li a0, 1
; RV32-NEXT: .LBB26_5: # %entry
-; RV32-NEXT: lw a3, 8(sp)
-; RV32-NEXT: lw a4, 12(sp)
+; RV32-NEXT: lw a4, 8(sp)
+; RV32-NEXT: lw a3, 12(sp)
; RV32-NEXT: and a5, a2, a1
; RV32-NEXT: beqz a5, .LBB26_7
; RV32-NEXT: # %bb.6: # %entry
@@ -1936,12 +1936,12 @@ define i64 @ustest_f16i64(half %x) {
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: bnez a0, .LBB26_10
; RV32-NEXT: # %bb.9:
-; RV32-NEXT: or a0, a2, a4
+; RV32-NEXT: or a0, a4, a2
; RV32-NEXT: snez a1, a0
; RV32-NEXT: .LBB26_10: # %entry
; RV32-NEXT: neg a1, a1
-; RV32-NEXT: and a0, a1, a2
-; RV32-NEXT: and a1, a1, a4
+; RV32-NEXT: and a0, a1, a4
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
@@ -3046,15 +3046,15 @@ define i64 @stest_f64i64_mm(double %x) {
; RV32IF-NEXT: .LBB45_3: # %entry
; RV32IF-NEXT: slti a6, a4, 0
; RV32IF-NEXT: .LBB45_4: # %entry
-; RV32IF-NEXT: addi a7, a6, -1
-; RV32IF-NEXT: neg t0, a6
+; RV32IF-NEXT: neg a7, a6
+; RV32IF-NEXT: addi t0, a6, -1
; RV32IF-NEXT: bnez a6, .LBB45_6
; RV32IF-NEXT: # %bb.5: # %entry
; RV32IF-NEXT: mv a1, a5
; RV32IF-NEXT: .LBB45_6: # %entry
-; RV32IF-NEXT: or a3, a7, a3
-; RV32IF-NEXT: and a4, t0, a4
-; RV32IF-NEXT: and a2, t0, a2
+; RV32IF-NEXT: or a3, t0, a3
+; RV32IF-NEXT: and a4, a7, a4
+; RV32IF-NEXT: and a2, a7, a2
; RV32IF-NEXT: beq a1, a0, .LBB45_8
; RV32IF-NEXT: # %bb.7: # %entry
; RV32IF-NEXT: sltu a0, a0, a1
@@ -3149,15 +3149,15 @@ define i64 @stest_f64i64_mm(double %x) {
; RV32IFD-NEXT: .LBB45_3: # %entry
; RV32IFD-NEXT: slti a6, a4, 0
; RV32IFD-NEXT: .LBB45_4: # %entry
-; RV32IFD-NEXT: addi a7, a6, -1
-; RV32IFD-NEXT: neg t0, a6
+; RV32IFD-NEXT: neg a7, a6
+; RV32IFD-NEXT: addi t0, a6, -1
; RV32IFD-NEXT: bnez a6, .LBB45_6
; RV32IFD-NEXT: # %bb.5: # %entry
; RV32IFD-NEXT: mv a1, a5
; RV32IFD-NEXT: .LBB45_6: # %entry
-; RV32IFD-NEXT: or a3, a7, a3
-; RV32IFD-NEXT: and a4, t0, a4
-; RV32IFD-NEXT: and a2, t0, a2
+; RV32IFD-NEXT: or a3, t0, a3
+; RV32IFD-NEXT: and a4, a7, a4
+; RV32IFD-NEXT: and a2, a7, a2
; RV32IFD-NEXT: beq a1, a0, .LBB45_8
; RV32IFD-NEXT: # %bb.7: # %entry
; RV32IFD-NEXT: sltu a0, a0, a1
@@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IF-NEXT: mv a1, a0
; RV32IF-NEXT: addi a0, sp, 8
; RV32IF-NEXT: call __fixdfti
-; RV32IF-NEXT: lw a0, 20(sp)
-; RV32IF-NEXT: lw a1, 8(sp)
-; RV32IF-NEXT: lw a2, 12(sp)
+; RV32IF-NEXT: lw a0, 8(sp)
+; RV32IF-NEXT: lw a1, 12(sp)
+; RV32IF-NEXT: lw a2, 20(sp)
; RV32IF-NEXT: lw a3, 16(sp)
-; RV32IF-NEXT: beqz a0, .LBB47_2
+; RV32IF-NEXT: beqz a2, .LBB47_2
; RV32IF-NEXT: # %bb.1: # %entry
-; RV32IF-NEXT: slti a4, a0, 0
+; RV32IF-NEXT: slti a4, a2, 0
; RV32IF-NEXT: j .LBB47_3
; RV32IF-NEXT: .LBB47_2:
; RV32IF-NEXT: seqz a4, a3
; RV32IF-NEXT: .LBB47_3: # %entry
; RV32IF-NEXT: xori a3, a3, 1
-; RV32IF-NEXT: or a3, a3, a0
+; RV32IF-NEXT: or a3, a3, a2
; RV32IF-NEXT: seqz a3, a3
; RV32IF-NEXT: addi a3, a3, -1
; RV32IF-NEXT: and a3, a3, a4
; RV32IF-NEXT: neg a3, a3
-; RV32IF-NEXT: and a2, a3, a2
; RV32IF-NEXT: and a1, a3, a1
; RV32IF-NEXT: and a0, a3, a0
-; RV32IF-NEXT: slti a0, a0, 0
-; RV32IF-NEXT: addi a3, a0, -1
-; RV32IF-NEXT: and a0, a3, a1
-; RV32IF-NEXT: and a1, a3, a2
+; RV32IF-NEXT: and a2, a3, a2
+; RV32IF-NEXT: slti a2, a2, 0
+; RV32IF-NEXT: addi a2, a2, -1
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: and a1, a2, a1
; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IF-NEXT: .cfi_restore ra
; RV32IF-NEXT: addi sp, sp, 32
@@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IFD-NEXT: .cfi_offset ra, -4
; RV32IFD-NEXT: addi a0, sp, 8
; RV32IFD-NEXT: call __fixdfti
-; RV32IFD-NEXT: lw a0, 20(sp)
-; RV32IFD-NEXT: lw a1, 8(sp)
-; RV32IFD-NEXT: lw a2, 12(sp)
+; RV32IFD-NEXT: lw a0, 8(sp)
+; RV32IFD-NEXT: lw a1, 12(sp)
+; RV32IFD-NEXT: lw a2, 20(sp)
; RV32IFD-NEXT: lw a3, 16(sp)
-; RV32IFD-NEXT: beqz a0, .LBB47_2
+; RV32IFD-NEXT: beqz a2, .LBB47_2
; RV32IFD-NEXT: # %bb.1: # %entry
-; RV32IFD-NEXT: slti a4, a0, 0
+; RV32IFD-NEXT: slti a4, a2, 0
; RV32IFD-NEXT: j .LBB47_3
; RV32IFD-NEXT: .LBB47_2:
; RV32IFD-NEXT: seqz a4, a3
; RV32IFD-NEXT: .LBB47_3: # %entry
; RV32IFD-NEXT: xori a3, a3, 1
-; RV32IFD-NEXT: or a3, a3, a0
+; RV32IFD-NEXT: or a3, a3, a2
; RV32IFD-NEXT: seqz a3, a3
; RV32IFD-NEXT: addi a3, a3, -1
; RV32IFD-NEXT: and a3, a3, a4
; RV32IFD-NEXT: neg a3, a3
-; RV32IFD-NEXT: and a2, a3, a2
; RV32IFD-NEXT: and a1, a3, a1
; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: slti a0, a0, 0
-; RV32IFD-NEXT: addi a3, a0, -1
-; RV32IFD-NEXT: and a0, a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
+; RV32IFD-NEXT: and a2, a3, a2
+; RV32IFD-NEXT: slti a2, a2, 0
+; RV32IFD-NEXT: addi a2, a2, -1
+; RV32IFD-NEXT: and a0, a2, a0
+; RV32IFD-NEXT: and a1, a2, a1
; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: .cfi_restore ra
; RV32IFD-NEXT: addi sp, sp, 32
@@ -3419,15 +3419,15 @@ define i64 @stest_f32i64_mm(float %x) {
; RV32-NEXT: .LBB48_3: # %entry
; RV32-NEXT: slti a6, a4, 0
; RV32-NEXT: .LBB48_4: # %entry
-; RV32-NEXT: addi a7, a6, -1
-; RV32-NEXT: neg t0, a6
+; RV32-NEXT: neg a7, a6
+; RV32-NEXT: addi t0, a6, -1
; RV32-NEXT: bnez a6, .LBB48_6
; RV32-NEXT: # %bb.5: # %entry
; RV32-NEXT: mv a1, a5
; RV32-NEXT: .LBB48_6: # %entry
-; RV32-NEXT: or a3, a7, a3
-; RV32-NEXT: and a4, t0, a4
-; RV32-NEXT: and a2, t0, a2
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: and a4, a7, a4
+; RV32-NEXT: and a2, a7, a2
; RV32-NEXT: beq a1, a0, .LBB48_8
; RV32-NEXT: # %bb.7: # %entry
; RV32-NEXT: sltu a0, a0, a1
@@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) {
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 20(sp)
-; RV32-NEXT: lw a1, 8(sp)
-; RV32-NEXT: lw a2, 12(sp)
+; RV32-NEXT: lw a0, 8(sp)
+; RV32-NEXT: lw a1, 12(sp)
+; RV32-NEXT: lw a2, 20(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a0, .LBB50_2
+; RV32-NEXT: beqz a2, .LBB50_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a0, 0
+; RV32-NEXT: slti a4, a2, 0
; RV32-NEXT: j .LBB50_3
; RV32-NEXT: .LBB50_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB50_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a0
+; RV32-NEXT: or a3, a3, a2
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
-; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: slti a0, a0, 0
-; RV32-NEXT: addi a3, a0, -1
-; RV32-NEXT: and a0, a3, a1
-; RV32-NEXT: and a1, a3, a2
+; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: slti a2, a2, 0
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a2, a0
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
@@ -3620,15 +3620,15 @@ define i64 @stest_f16i64_mm(half %x) {
; RV32-NEXT: .LBB51_3: # %entry
; RV32-NEXT: slti a6, a4, 0
; RV32-NEXT: .LBB51_4: # %entry
-; RV32-NEXT: addi a7, a6, -1
-; RV32-NEXT: neg t0, a6
+; RV32-NEXT: neg a7, a6
+; RV32-NEXT: addi t0, a6, -1
; RV32-NEXT: bnez a6, .LBB51_6
; RV32-NEXT: # %bb.5: # %entry
; RV32-NEXT: mv a1, a5
; RV32-NEXT: .LBB51_6: # %entry
-; RV32-NEXT: or a3, a7, a3
-; RV32-NEXT: and a4, t0, a4
-; RV32-NEXT: and a2, t0, a2
+; RV32-NEXT: or a3, t0, a3
+; RV32-NEXT: and a4, a7, a4
+; RV32-NEXT: and a2, a7, a2
; RV32-NEXT: beq a1, a0, .LBB51_8
; RV32-NEXT: # %bb.7: # %entry
; RV32-NEXT: sltu a0, a0, a1
@@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) {
; RV32-NEXT: call __extendhfsf2
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 20(sp)
-; RV32-NEXT: lw a1, 8(sp)
-; RV32-NEXT: lw a2, 12(sp)
+; RV32-NEXT: lw a0, 8(sp)
+; RV32-NEXT: lw a1, 12(sp)
+; RV32-NEXT: lw a2, 20(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a0, .LBB53_2
+; RV32-NEXT: beqz a2, .LBB53_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a0, 0
+; RV32-NEXT: slti a4, a2, 0
; RV32-NEXT: j .LBB53_3
; RV32-NEXT: .LBB53_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB53_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a0
+; RV32-NEXT: or a3, a3, a2
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
-; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: slti a0, a0, 0
-; RV32-NEXT: addi a3, a0, -1
-; RV32-NEXT: and a0, a3, a1
-; RV32-NEXT: and a1, a3, a2
+; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: slti a2, a2, 0
+; RV32-NEXT: addi a2, a2, -1
+; RV32-NEXT: and a0, a2, a0
+; RV32-NEXT: and a1, a2, a1
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index facb544..0c152e6 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -2262,12 +2262,12 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
; RV32IZHINX-NEXT: addi a2, a3, -1
; RV32IZHINX-NEXT: .LBB10_4: # %start
; RV32IZHINX-NEXT: feq.s a3, s0, s0
-; RV32IZHINX-NEXT: neg a4, a1
-; RV32IZHINX-NEXT: neg a1, s1
+; RV32IZHINX-NEXT: neg a4, s1
+; RV32IZHINX-NEXT: neg a5, a1
; RV32IZHINX-NEXT: neg a3, a3
-; RV32IZHINX-NEXT: and a0, a1, a0
+; RV32IZHINX-NEXT: and a0, a4, a0
; RV32IZHINX-NEXT: and a1, a3, a2
-; RV32IZHINX-NEXT: or a0, a4, a0
+; RV32IZHINX-NEXT: or a0, a5, a0
; RV32IZHINX-NEXT: and a0, a3, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
@@ -2309,12 +2309,12 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
; RV32IZDINXZHINX-NEXT: addi a2, a3, -1
; RV32IZDINXZHINX-NEXT: .LBB10_4: # %start
; RV32IZDINXZHINX-NEXT: feq.s a3, s0, s0
-; RV32IZDINXZHINX-NEXT: neg a4, a1
-; RV32IZDINXZHINX-NEXT: neg a1, s1
+; RV32IZDINXZHINX-NEXT: neg a4, s1
+; RV32IZDINXZHINX-NEXT: neg a5, a1
; RV32IZDINXZHINX-NEXT: neg a3, a3
-; RV32IZDINXZHINX-NEXT: and a0, a1, a0
+; RV32IZDINXZHINX-NEXT: and a0, a4, a0
; RV32IZDINXZHINX-NEXT: and a1, a3, a2
-; RV32IZDINXZHINX-NEXT: or a0, a4, a0
+; RV32IZDINXZHINX-NEXT: or a0, a5, a0
; RV32IZDINXZHINX-NEXT: and a0, a3, a0
; RV32IZDINXZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZDINXZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
@@ -2653,12 +2653,12 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
; CHECK32-IZHINXMIN-NEXT: addi a2, a3, -1
; CHECK32-IZHINXMIN-NEXT: .LBB10_4: # %start
; CHECK32-IZHINXMIN-NEXT: feq.s a3, s0, s0
-; CHECK32-IZHINXMIN-NEXT: neg a4, a1
-; CHECK32-IZHINXMIN-NEXT: neg a1, s1
+; CHECK32-IZHINXMIN-NEXT: neg a4, s1
+; CHECK32-IZHINXMIN-NEXT: neg a5, a1
; CHECK32-IZHINXMIN-NEXT: neg a3, a3
-; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0
+; CHECK32-IZHINXMIN-NEXT: and a0, a4, a0
; CHECK32-IZHINXMIN-NEXT: and a1, a3, a2
-; CHECK32-IZHINXMIN-NEXT: or a0, a4, a0
+; CHECK32-IZHINXMIN-NEXT: or a0, a5, a0
; CHECK32-IZHINXMIN-NEXT: and a0, a3, a0
; CHECK32-IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; CHECK32-IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
@@ -2701,12 +2701,12 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
; CHECK32-IZDINXZHINXMIN-NEXT: addi a2, a3, -1
; CHECK32-IZDINXZHINXMIN-NEXT: .LBB10_4: # %start
; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a3, s0, s0
-; CHECK32-IZDINXZHINXMIN-NEXT: neg a4, a1
-; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, s1
+; CHECK32-IZDINXZHINXMIN-NEXT: neg a4, s1
+; CHECK32-IZDINXZHINXMIN-NEXT: neg a5, a1
; CHECK32-IZDINXZHINXMIN-NEXT: neg a3, a3
-; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a4, a0
; CHECK32-IZDINXZHINXMIN-NEXT: and a1, a3, a2
-; CHECK32-IZDINXZHINXMIN-NEXT: or a0, a4, a0
+; CHECK32-IZDINXZHINXMIN-NEXT: or a0, a5, a0
; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a3, a0
; CHECK32-IZDINXZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; CHECK32-IZDINXZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
@@ -2972,18 +2972,19 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
; RV32IZHINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZHINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IZHINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZHINX-NEXT: fcvt.s.h a0, a0
-; RV32IZHINX-NEXT: lui a1, 391168
-; RV32IZHINX-NEXT: addi a1, a1, -1
-; RV32IZHINX-NEXT: fle.s a2, zero, a0
-; RV32IZHINX-NEXT: flt.s a1, a1, a0
-; RV32IZHINX-NEXT: neg s0, a1
-; RV32IZHINX-NEXT: neg s1, a2
+; RV32IZHINX-NEXT: fcvt.s.h s0, a0
+; RV32IZHINX-NEXT: fle.s a0, zero, s0
+; RV32IZHINX-NEXT: neg s1, a0
+; RV32IZHINX-NEXT: mv a0, s0
; RV32IZHINX-NEXT: call __fixunssfdi
; RV32IZHINX-NEXT: and a0, s1, a0
+; RV32IZHINX-NEXT: lui a2, 391168
; RV32IZHINX-NEXT: and a1, s1, a1
-; RV32IZHINX-NEXT: or a0, s0, a0
-; RV32IZHINX-NEXT: or a1, s0, a1
+; RV32IZHINX-NEXT: addi a2, a2, -1
+; RV32IZHINX-NEXT: flt.s a2, a2, s0
+; RV32IZHINX-NEXT: neg a2, a2
+; RV32IZHINX-NEXT: or a0, a2, a0
+; RV32IZHINX-NEXT: or a1, a2, a1
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -3005,18 +3006,19 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
; RV32IZDINXZHINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZDINXZHINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IZDINXZHINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZDINXZHINX-NEXT: fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT: lui a1, 391168
-; RV32IZDINXZHINX-NEXT: addi a1, a1, -1
-; RV32IZDINXZHINX-NEXT: fle.s a2, zero, a0
-; RV32IZDINXZHINX-NEXT: flt.s a1, a1, a0
-; RV32IZDINXZHINX-NEXT: neg s0, a1
-; RV32IZDINXZHINX-NEXT: neg s1, a2
+; RV32IZDINXZHINX-NEXT: fcvt.s.h s0, a0
+; RV32IZDINXZHINX-NEXT: fle.s a0, zero, s0
+; RV32IZDINXZHINX-NEXT: neg s1, a0
+; RV32IZDINXZHINX-NEXT: mv a0, s0
; RV32IZDINXZHINX-NEXT: call __fixunssfdi
; RV32IZDINXZHINX-NEXT: and a0, s1, a0
+; RV32IZDINXZHINX-NEXT: lui a2, 391168
; RV32IZDINXZHINX-NEXT: and a1, s1, a1
-; RV32IZDINXZHINX-NEXT: or a0, s0, a0
-; RV32IZDINXZHINX-NEXT: or a1, s0, a1
+; RV32IZDINXZHINX-NEXT: addi a2, a2, -1
+; RV32IZDINXZHINX-NEXT: flt.s a2, a2, s0
+; RV32IZDINXZHINX-NEXT: neg a2, a2
+; RV32IZDINXZHINX-NEXT: or a0, a2, a0
+; RV32IZDINXZHINX-NEXT: or a1, a2, a1
; RV32IZDINXZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZDINXZHINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZDINXZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -3217,18 +3219,19 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
; CHECK32-IZHINXMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; CHECK32-IZHINXMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; CHECK32-IZHINXMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT: lui a1, 391168
-; CHECK32-IZHINXMIN-NEXT: addi a1, a1, -1
-; CHECK32-IZHINXMIN-NEXT: fle.s a2, zero, a0
-; CHECK32-IZHINXMIN-NEXT: flt.s a1, a1, a0
-; CHECK32-IZHINXMIN-NEXT: neg s0, a1
-; CHECK32-IZHINXMIN-NEXT: neg s1, a2
+; CHECK32-IZHINXMIN-NEXT: fcvt.s.h s0, a0
+; CHECK32-IZHINXMIN-NEXT: fle.s a0, zero, s0
+; CHECK32-IZHINXMIN-NEXT: neg s1, a0
+; CHECK32-IZHINXMIN-NEXT: mv a0, s0
; CHECK32-IZHINXMIN-NEXT: call __fixunssfdi
; CHECK32-IZHINXMIN-NEXT: and a0, s1, a0
+; CHECK32-IZHINXMIN-NEXT: lui a2, 391168
; CHECK32-IZHINXMIN-NEXT: and a1, s1, a1
-; CHECK32-IZHINXMIN-NEXT: or a0, s0, a0
-; CHECK32-IZHINXMIN-NEXT: or a1, s0, a1
+; CHECK32-IZHINXMIN-NEXT: addi a2, a2, -1
+; CHECK32-IZHINXMIN-NEXT: flt.s a2, a2, s0
+; CHECK32-IZHINXMIN-NEXT: neg a2, a2
+; CHECK32-IZHINXMIN-NEXT: or a0, a2, a0
+; CHECK32-IZHINXMIN-NEXT: or a1, a2, a1
; CHECK32-IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; CHECK32-IZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; CHECK32-IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -3251,18 +3254,19 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
; CHECK32-IZDINXZHINXMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; CHECK32-IZDINXZHINXMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; CHECK32-IZDINXZHINXMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT: lui a1, 391168
-; CHECK32-IZDINXZHINXMIN-NEXT: addi a1, a1, -1
-; CHECK32-IZDINXZHINXMIN-NEXT: fle.s a2, zero, a0
-; CHECK32-IZDINXZHINXMIN-NEXT: flt.s a1, a1, a0
-; CHECK32-IZDINXZHINXMIN-NEXT: neg s0, a1
-; CHECK32-IZDINXZHINXMIN-NEXT: neg s1, a2
+; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h s0, a0
+; CHECK32-IZDINXZHINXMIN-NEXT: fle.s a0, zero, s0
+; CHECK32-IZDINXZHINXMIN-NEXT: neg s1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT: mv a0, s0
; CHECK32-IZDINXZHINXMIN-NEXT: call __fixunssfdi
; CHECK32-IZDINXZHINXMIN-NEXT: and a0, s1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT: lui a2, 391168
; CHECK32-IZDINXZHINXMIN-NEXT: and a1, s1, a1
-; CHECK32-IZDINXZHINXMIN-NEXT: or a0, s0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT: or a1, s0, a1
+; CHECK32-IZDINXZHINXMIN-NEXT: addi a2, a2, -1
+; CHECK32-IZDINXZHINXMIN-NEXT: flt.s a2, a2, s0
+; CHECK32-IZDINXZHINXMIN-NEXT: neg a2, a2
+; CHECK32-IZDINXZHINXMIN-NEXT: or a0, a2, a0
+; CHECK32-IZDINXZHINXMIN-NEXT: or a1, a2, a1
; CHECK32-IZDINXZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; CHECK32-IZDINXZHINXMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; CHECK32-IZDINXZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 774f1a1..c157c63 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -301,58 +301,58 @@ define i64 @select_abs64(i64 %x) {
define i128 @abs128(i128 %x) {
; RV32I-LABEL: abs128:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a3, 12(a1)
-; RV32I-NEXT: lw a2, 0(a1)
+; RV32I-NEXT: lw a2, 12(a1)
+; RV32I-NEXT: lw a3, 0(a1)
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a1, 8(a1)
-; RV32I-NEXT: bgez a3, .LBB8_2
+; RV32I-NEXT: bgez a2, .LBB8_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: neg a5, a1
; RV32I-NEXT: snez a6, a4
-; RV32I-NEXT: snez a7, a2
+; RV32I-NEXT: snez a7, a3
; RV32I-NEXT: snez a1, a1
; RV32I-NEXT: neg a4, a4
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: add a1, a3, a1
+; RV32I-NEXT: add a1, a2, a1
; RV32I-NEXT: sub a4, a4, a7
-; RV32I-NEXT: sltu a3, a5, a6
+; RV32I-NEXT: sltu a2, a5, a6
; RV32I-NEXT: neg a7, a1
; RV32I-NEXT: sub a1, a5, a6
-; RV32I-NEXT: sub a3, a7, a3
-; RV32I-NEXT: neg a2, a2
+; RV32I-NEXT: sub a2, a7, a2
+; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: .LBB8_2:
-; RV32I-NEXT: sw a2, 0(a0)
+; RV32I-NEXT: sw a3, 0(a0)
; RV32I-NEXT: sw a4, 4(a0)
; RV32I-NEXT: sw a1, 8(a0)
-; RV32I-NEXT: sw a3, 12(a0)
+; RV32I-NEXT: sw a2, 12(a0)
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: abs128:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: lw a3, 12(a1)
-; RV32ZBB-NEXT: lw a2, 0(a1)
+; RV32ZBB-NEXT: lw a2, 12(a1)
+; RV32ZBB-NEXT: lw a3, 0(a1)
; RV32ZBB-NEXT: lw a4, 4(a1)
; RV32ZBB-NEXT: lw a1, 8(a1)
-; RV32ZBB-NEXT: bgez a3, .LBB8_2
+; RV32ZBB-NEXT: bgez a2, .LBB8_2
; RV32ZBB-NEXT: # %bb.1:
; RV32ZBB-NEXT: neg a5, a1
; RV32ZBB-NEXT: snez a6, a4
-; RV32ZBB-NEXT: snez a7, a2
+; RV32ZBB-NEXT: snez a7, a3
; RV32ZBB-NEXT: snez a1, a1
; RV32ZBB-NEXT: neg a4, a4
; RV32ZBB-NEXT: or a6, a7, a6
-; RV32ZBB-NEXT: add a1, a3, a1
+; RV32ZBB-NEXT: add a1, a2, a1
; RV32ZBB-NEXT: sub a4, a4, a7
-; RV32ZBB-NEXT: sltu a3, a5, a6
+; RV32ZBB-NEXT: sltu a2, a5, a6
; RV32ZBB-NEXT: neg a7, a1
; RV32ZBB-NEXT: sub a1, a5, a6
-; RV32ZBB-NEXT: sub a3, a7, a3
-; RV32ZBB-NEXT: neg a2, a2
+; RV32ZBB-NEXT: sub a2, a7, a2
+; RV32ZBB-NEXT: neg a3, a3
; RV32ZBB-NEXT: .LBB8_2:
-; RV32ZBB-NEXT: sw a2, 0(a0)
+; RV32ZBB-NEXT: sw a3, 0(a0)
; RV32ZBB-NEXT: sw a4, 4(a0)
; RV32ZBB-NEXT: sw a1, 8(a0)
-; RV32ZBB-NEXT: sw a3, 12(a0)
+; RV32ZBB-NEXT: sw a2, 12(a0)
; RV32ZBB-NEXT: ret
;
; RV64I-LABEL: abs128:
@@ -383,58 +383,58 @@ define i128 @abs128(i128 %x) {
define i128 @select_abs128(i128 %x) {
; RV32I-LABEL: select_abs128:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a3, 12(a1)
-; RV32I-NEXT: lw a2, 0(a1)
+; RV32I-NEXT: lw a2, 12(a1)
+; RV32I-NEXT: lw a3, 0(a1)
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a1, 8(a1)
-; RV32I-NEXT: bgez a3, .LBB9_2
+; RV32I-NEXT: bgez a2, .LBB9_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: neg a5, a1
; RV32I-NEXT: snez a6, a4
-; RV32I-NEXT: snez a7, a2
+; RV32I-NEXT: snez a7, a3
; RV32I-NEXT: snez a1, a1
; RV32I-NEXT: neg a4, a4
; RV32I-NEXT: or a6, a7, a6
-; RV32I-NEXT: add a1, a3, a1
+; RV32I-NEXT: add a1, a2, a1
; RV32I-NEXT: sub a4, a4, a7
-; RV32I-NEXT: sltu a3, a5, a6
+; RV32I-NEXT: sltu a2, a5, a6
; RV32I-NEXT: neg a7, a1
; RV32I-NEXT: sub a1, a5, a6
-; RV32I-NEXT: sub a3, a7, a3
-; RV32I-NEXT: neg a2, a2
+; RV32I-NEXT: sub a2, a7, a2
+; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: .LBB9_2:
-; RV32I-NEXT: sw a2, 0(a0)
+; RV32I-NEXT: sw a3, 0(a0)
; RV32I-NEXT: sw a4, 4(a0)
; RV32I-NEXT: sw a1, 8(a0)
-; RV32I-NEXT: sw a3, 12(a0)
+; RV32I-NEXT: sw a2, 12(a0)
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: select_abs128:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: lw a3, 12(a1)
-; RV32ZBB-NEXT: lw a2, 0(a1)
+; RV32ZBB-NEXT: lw a2, 12(a1)
+; RV32ZBB-NEXT: lw a3, 0(a1)
; RV32ZBB-NEXT: lw a4, 4(a1)
; RV32ZBB-NEXT: lw a1, 8(a1)
-; RV32ZBB-NEXT: bgez a3, .LBB9_2
+; RV32ZBB-NEXT: bgez a2, .LBB9_2
; RV32ZBB-NEXT: # %bb.1:
; RV32ZBB-NEXT: neg a5, a1
; RV32ZBB-NEXT: snez a6, a4
-; RV32ZBB-NEXT: snez a7, a2
+; RV32ZBB-NEXT: snez a7, a3
; RV32ZBB-NEXT: snez a1, a1
; RV32ZBB-NEXT: neg a4, a4
; RV32ZBB-NEXT: or a6, a7, a6
-; RV32ZBB-NEXT: add a1, a3, a1
+; RV32ZBB-NEXT: add a1, a2, a1
; RV32ZBB-NEXT: sub a4, a4, a7
-; RV32ZBB-NEXT: sltu a3, a5, a6
+; RV32ZBB-NEXT: sltu a2, a5, a6
; RV32ZBB-NEXT: neg a7, a1
; RV32ZBB-NEXT: sub a1, a5, a6
-; RV32ZBB-NEXT: sub a3, a7, a3
-; RV32ZBB-NEXT: neg a2, a2
+; RV32ZBB-NEXT: sub a2, a7, a2
+; RV32ZBB-NEXT: neg a3, a3
; RV32ZBB-NEXT: .LBB9_2:
-; RV32ZBB-NEXT: sw a2, 0(a0)
+; RV32ZBB-NEXT: sw a3, 0(a0)
; RV32ZBB-NEXT: sw a4, 4(a0)
; RV32ZBB-NEXT: sw a1, 8(a0)
-; RV32ZBB-NEXT: sw a3, 12(a0)
+; RV32ZBB-NEXT: sw a2, 12(a0)
; RV32ZBB-NEXT: ret
;
; RV64I-LABEL: select_abs128:
diff --git a/llvm/test/CodeGen/RISCV/macro-fusions.mir b/llvm/test/CodeGen/RISCV/macro-fusions.mir
index 1346414..ae5b52d 100644
--- a/llvm/test/CodeGen/RISCV/macro-fusions.mir
+++ b/llvm/test/CodeGen/RISCV/macro-fusions.mir
@@ -2,7 +2,12 @@
# RUN: llc -mtriple=riscv64-linux-gnu -x=mir < %s \
# RUN: -debug-only=machine-scheduler -start-before=machine-scheduler 2>&1 \
# RUN: -mattr=+lui-addi-fusion,+auipc-addi-fusion,+zexth-fusion,+zextw-fusion,+shifted-zextw-fusion,+ld-add-fusion \
+# RUN: -mattr=+add-load-fusion,+auipc-load-fusion,+lui-load-fusion,+addi-load-fusion \
+# RUN: -mattr=+zba,+shxadd-load-fusion \
# RUN: | FileCheck %s
+# RUN: llc -mtriple=riscv64-linux-gnu -x=mir < %s \
+# RUN: -debug-only=machine-scheduler -start-before=machine-scheduler 2>&1 \
+# RUN: -mattr=+zba,+bfext-fusion | FileCheck --check-prefixes=CHECK-BFEXT %s

# CHECK: lui_addi:%bb.0
# CHECK: Macro fuse: {{.*}}LUI - ADDI
@@ -174,3 +179,1374 @@ body: |
$x11 = COPY %5
PseudoRET
...
+
+# CHECK: add_lb
+# CHECK: Macro fuse: {{.*}}ADD - LB
+---
+name: add_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LB %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: add_lh
+# CHECK: Macro fuse: {{.*}}ADD - LH
+---
+name: add_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LH %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: add_lw
+# CHECK: Macro fuse: {{.*}}ADD - LW
+---
+name: add_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LW %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: add_lbu
+# CHECK: Macro fuse: {{.*}}ADD - LBU
+---
+name: add_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LBU %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: add_lhu
+# CHECK: Macro fuse: {{.*}}ADD - LHU
+---
+name: add_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LHU %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: add_lwu
+# CHECK: Macro fuse: {{.*}}ADD - LWU
+---
+name: add_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LWU %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: auipc_lb
+# CHECK: Macro fuse: {{.*}}AUIPC - LB
+---
+name: auipc_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = AUIPC 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LB %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: auipc_lh
+# CHECK: Macro fuse: {{.*}}AUIPC - LH
+---
+name: auipc_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = AUIPC 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LH %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: auipc_lw
+# CHECK: Macro fuse: {{.*}}AUIPC - LW
+---
+name: auipc_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = AUIPC 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LW %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: auipc_ld
+# CHECK: Macro fuse: {{.*}}AUIPC - LD
+---
+name: auipc_ld
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = AUIPC 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LD %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: auipc_lbu
+# CHECK: Macro fuse: {{.*}}AUIPC - LBU
+---
+name: auipc_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = AUIPC 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LBU %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: auipc_lhu
+# CHECK: Macro fuse: {{.*}}AUIPC - LHU
+---
+name: auipc_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = AUIPC 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LHU %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: auipc_lwu
+# CHECK: Macro fuse: {{.*}}AUIPC - LWU
+---
+name: auipc_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = AUIPC 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LWU %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: lui_lb
+# CHECK: Macro fuse: {{.*}}LUI - LB
+---
+name: lui_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = LUI 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LB %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: lui_lh
+# CHECK: Macro fuse: {{.*}}LUI - LH
+---
+name: lui_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = LUI 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LH %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: lui_lw
+# CHECK: Macro fuse: {{.*}}LUI - LW
+---
+name: lui_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = LUI 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LW %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: lui_ld
+# CHECK: Macro fuse: {{.*}}LUI - LD
+---
+name: lui_ld
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = LUI 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LD %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: lui_lbu
+# CHECK: Macro fuse: {{.*}}LUI - LBU
+---
+name: lui_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = LUI 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LBU %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: lui_lhu
+# CHECK: Macro fuse: {{.*}}LUI - LHU
+---
+name: lui_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = LUI 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LHU %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: lui_lwu
+# CHECK: Macro fuse: {{.*}}LUI - LWU
+---
+name: lui_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = LUI 1
+ %3:gpr = XORI %1, 2
+ %4:gpr = LWU %2, 4
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK-BFEXT: bitfield_extract
+# CHECK-BFEXT: Macro fuse: {{.*}}SLLI - SRLI
+---
+name: bitfield_extract
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = SLLI %1, 31
+ %3:gpr = XORI %1, 3
+ %4:gpr = SRLI %2, 48
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
+# CHECK: addi_lb
+# CHECK: Macro fuse: {{.*}}ADDI - LB
+---
+name: addi_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADDI %1, 8
+ %4:gpr = XORI %2, 3
+ %5:gpr = LB %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: addi_lh
+# CHECK: Macro fuse: {{.*}}ADDI - LH
+---
+name: addi_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADDI %1, 8
+ %4:gpr = XORI %2, 3
+ %5:gpr = LH %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: addi_lw
+# CHECK: Macro fuse: {{.*}}ADDI - LW
+---
+name: addi_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADDI %1, 8
+ %4:gpr = XORI %2, 3
+ %5:gpr = LW %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: addi_ld
+# CHECK: Macro fuse: {{.*}}ADDI - LD
+---
+name: addi_ld
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADDI %1, 8
+ %4:gpr = XORI %2, 3
+ %5:gpr = LD %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: addi_lbu
+# CHECK: Macro fuse: {{.*}}ADDI - LBU
+---
+name: addi_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADDI %1, 8
+ %4:gpr = XORI %2, 3
+ %5:gpr = LBU %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: addi_lhu
+# CHECK: Macro fuse: {{.*}}ADDI - LHU
+---
+name: addi_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADDI %1, 8
+ %4:gpr = XORI %2, 3
+ %5:gpr = LHU %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: addi_lwu
+# CHECK: Macro fuse: {{.*}}ADDI - LWU
+---
+name: addi_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADDI %1, 8
+ %4:gpr = XORI %2, 3
+ %5:gpr = LWU %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: adduw_lb
+# CHECK: Macro fuse: {{.*}}ADD_UW - LB
+---
+name: adduw_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LB %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: adduw_lh
+# CHECK: Macro fuse: {{.*}}ADD_UW - LH
+---
+name: adduw_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LH %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: adduw_lw
+# CHECK: Macro fuse: {{.*}}ADD_UW - LW
+---
+name: adduw_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LW %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: adduw_ld
+# CHECK: Macro fuse: {{.*}}ADD_UW - LD
+---
+name: adduw_ld
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LD %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: adduw_lbu
+# CHECK: Macro fuse: {{.*}}ADD_UW - LBU
+---
+name: adduw_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LBU %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: adduw_lhu
+# CHECK: Macro fuse: {{.*}}ADD_UW - LHU
+---
+name: adduw_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LHU %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: adduw_lwu
+# CHECK: Macro fuse: {{.*}}ADD_UW - LWU
+---
+name: adduw_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LWU %3, 0
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1add_lb
+# CHECK: Macro fuse: {{.*}}SH1ADD - LB
+---
+name: sh1add_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LB %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2add_lb
+# CHECK: Macro fuse: {{.*}}SH2ADD - LB
+---
+name: sh2add_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LB %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3add_lb
+# CHECK: Macro fuse: {{.*}}SH3ADD - LB
+---
+name: sh3add_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LB %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1add_lh
+# CHECK: Macro fuse: {{.*}}SH1ADD - LH
+---
+name: sh1add_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LH %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2add_lh
+# CHECK: Macro fuse: {{.*}}SH2ADD - LH
+---
+name: sh2add_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LH %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3add_lh
+# CHECK: Macro fuse: {{.*}}SH3ADD - LH
+---
+name: sh3add_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LH %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1add_lw
+# CHECK: Macro fuse: {{.*}}SH1ADD - LW
+---
+name: sh1add_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LW %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2add_lw
+# CHECK: Macro fuse: {{.*}}SH2ADD - LW
+---
+name: sh2add_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LW %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3add_lw
+# CHECK: Macro fuse: {{.*}}SH3ADD - LW
+---
+name: sh3add_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LW %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1add_ld
+# CHECK: Macro fuse: {{.*}}SH1ADD - LD
+---
+name: sh1add_ld
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LD %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2add_ld
+# CHECK: Macro fuse: {{.*}}SH2ADD - LD
+---
+name: sh2add_ld
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LD %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3add_ld
+# CHECK: Macro fuse: {{.*}}SH3ADD - LD
+---
+name: sh3add_ld
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LD %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1add_lbu
+# CHECK: Macro fuse: {{.*}}SH1ADD - LBU
+---
+name: sh1add_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LBU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2add_lbu
+# CHECK: Macro fuse: {{.*}}SH2ADD - LBU
+---
+name: sh2add_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LBU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3add_lbu
+# CHECK: Macro fuse: {{.*}}SH3ADD - LBU
+---
+name: sh3add_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LBU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1add_lhu
+# CHECK: Macro fuse: {{.*}}SH1ADD - LHU
+---
+name: sh1add_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LHU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2add_lhu
+# CHECK: Macro fuse: {{.*}}SH2ADD - LHU
+---
+name: sh2add_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LHU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3add_lhu
+# CHECK: Macro fuse: {{.*}}SH3ADD - LHU
+---
+name: sh3add_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LHU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1add_lwu
+# CHECK: Macro fuse: {{.*}}SH1ADD - LWU
+---
+name: sh1add_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LWU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2add_lwu
+# CHECK: Macro fuse: {{.*}}SH2ADD - LWU
+---
+name: sh2add_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LWU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3add_lwu
+# CHECK: Macro fuse: {{.*}}SH3ADD - LWU
+---
+name: sh3add_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LWU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1adduw_lb
+# CHECK: Macro fuse: {{.*}}SH1ADD_UW - LB
+---
+name: sh1adduw_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LB %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2adduw_lb
+# CHECK: Macro fuse: {{.*}}SH2ADD_UW - LB
+---
+name: sh2adduw_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LB %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3adduw_lb
+# CHECK: Macro fuse: {{.*}}SH3ADD_UW - LB
+---
+name: sh3adduw_lb
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LB %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1adduw_lh
+# CHECK: Macro fuse: {{.*}}SH1ADD_UW - LH
+---
+name: sh1adduw_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LH %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2adduw_lh
+# CHECK: Macro fuse: {{.*}}SH2ADD_UW - LH
+---
+name: sh2adduw_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LH %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3adduw_lh
+# CHECK: Macro fuse: {{.*}}SH3ADD_UW - LH
+---
+name: sh3adduw_lh
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LH %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1adduw_lw
+# CHECK: Macro fuse: {{.*}}SH1ADD_UW - LW
+---
+name: sh1adduw_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LW %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2adduw_lw
+# CHECK: Macro fuse: {{.*}}SH2ADD_UW - LW
+---
+name: sh2adduw_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LW %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3adduw_lw
+# CHECK: Macro fuse: {{.*}}SH3ADD_UW - LW
+---
+name: sh3adduw_lw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LW %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1adduw_ld
+# CHECK: Macro fuse: {{.*}}SH1ADD_UW - LD
+---
+name: sh1adduw_ld
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LD %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2adduw_ld
+# CHECK: Macro fuse: {{.*}}SH2ADD_UW - LD
+---
+name: sh2adduw_ld
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LD %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3adduw_ld
+# CHECK: Macro fuse: {{.*}}SH3ADD_UW - LD
+---
+name: sh3adduw_ld
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LD %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1adduw_lbu
+# CHECK: Macro fuse: {{.*}}SH1ADD_UW - LBU
+---
+name: sh1adduw_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LBU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2adduw_lbu
+# CHECK: Macro fuse: {{.*}}SH2ADD_UW - LBU
+---
+name: sh2adduw_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LBU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3adduw_lbu
+# CHECK: Macro fuse: {{.*}}SH3ADD_UW - LBU
+---
+name: sh3adduw_lbu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LBU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1adduw_lhu
+# CHECK: Macro fuse: {{.*}}SH1ADD_UW - LHU
+---
+name: sh1adduw_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LHU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2adduw_lhu
+# CHECK: Macro fuse: {{.*}}SH2ADD_UW - LHU
+---
+name: sh2adduw_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LHU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3adduw_lhu
+# CHECK: Macro fuse: {{.*}}SH3ADD_UW - LHU
+---
+name: sh3adduw_lhu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LHU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh1adduw_lwu
+# CHECK: Macro fuse: {{.*}}SH1ADD_UW - LWU
+---
+name: sh1adduw_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH1ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LWU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh2adduw_lwu
+# CHECK: Macro fuse: {{.*}}SH2ADD_UW - LWU
+---
+name: sh2adduw_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH2ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LWU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
+
+# CHECK: sh3adduw_lwu
+# CHECK: Macro fuse: {{.*}}SH3ADD_UW - LWU
+---
+name: sh3adduw_lwu
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ %1:gpr = COPY $x10
+ %2:gpr = COPY $x11
+ %3:gpr = SH3ADD_UW %1, %2
+ %4:gpr = XORI %2, 3
+ %5:gpr = LWU %3, 8
+ $x10 = COPY %4
+ $x11 = COPY %5
+ PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
index 160f0ae..abdc1ba 100644
--- a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
+++ b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
@@ -1,17 +1,42 @@
; REQUIRES: asserts
-; RUN: llc -mtriple=riscv32 -verify-misched -riscv-misched-load-store-clustering=false \
+;
+; Disable all misched clustering
+; RUN: llc -mtriple=riscv32 -verify-misched \
+; RUN: -mattr=+disable-misched-load-clustering,+disable-misched-store-clustering \
; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
; RUN: | FileCheck -check-prefix=NOCLUSTER %s
-; RUN: llc -mtriple=riscv64 -verify-misched -riscv-misched-load-store-clustering=false \
+; RUN: llc -mtriple=riscv64 -verify-misched \
+; RUN: -mattr=+disable-misched-load-clustering,+disable-misched-store-clustering \
; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
; RUN: | FileCheck -check-prefix=NOCLUSTER %s
+;
+; ST misched clustering only
+; RUN: llc -mtriple=riscv32 -verify-misched \
+; RUN: -mattr=+disable-misched-load-clustering \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=STCLUSTER %s
+; RUN: llc -mtriple=riscv64 -verify-misched \
+; RUN: -mattr=+disable-misched-load-clustering \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=STCLUSTER %s
+;
+; LD misched clustering only
; RUN: llc -mtriple=riscv32 -verify-misched \
+; RUN: -mattr=+disable-misched-store-clustering \
; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
; RUN: | FileCheck -check-prefix=LDCLUSTER %s
; RUN: llc -mtriple=riscv64 -verify-misched \
+; RUN: -mattr=+disable-misched-store-clustering \
; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
; RUN: | FileCheck -check-prefix=LDCLUSTER %s
-
+;
+; Default misched cluster settings (i.e. both LD and ST clustering)
+; RUN: llc -mtriple=riscv32 -verify-misched \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=DEFAULTCLUSTER %s
+; RUN: llc -mtriple=riscv64 -verify-misched \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=DEFAULTCLUSTER %s
define i32 @load_clustering_1(ptr nocapture %p) {
; NOCLUSTER: ********** MI Scheduling **********
@@ -22,6 +47,14 @@ define i32 @load_clustering_1(ptr nocapture %p) {
; NOCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
; NOCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
;
+; STCLUSTER: ********** MI Scheduling **********
+; STCLUSTER-LABEL: load_clustering_1:%bb.0
+; STCLUSTER: *** Final schedule for %bb.0 ***
+; STCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
+; STCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
+; STCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
+; STCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
+;
; LDCLUSTER: ********** MI Scheduling **********
; LDCLUSTER-LABEL: load_clustering_1:%bb.0
; LDCLUSTER: *** Final schedule for %bb.0 ***
@@ -29,6 +62,14 @@ define i32 @load_clustering_1(ptr nocapture %p) {
; LDCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
; LDCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
; LDCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
+;
+; DEFAULTCLUSTER: ********** MI Scheduling **********
+; DEFAULTCLUSTER-LABEL: load_clustering_1:%bb.0
+; DEFAULTCLUSTER: *** Final schedule for %bb.0 ***
+; DEFAULTCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
+; DEFAULTCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
+; DEFAULTCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
+; DEFAULTCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
entry:
%arrayidx0 = getelementptr inbounds i32, ptr %p, i32 3
%val0 = load i32, ptr %arrayidx0
diff --git a/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir b/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir
index 21398d3..01960f9 100644
--- a/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir
+++ b/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir
@@ -1,10 +1,12 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=riscv64 -x mir -mcpu=sifive-p470 -verify-misched -enable-post-misched=false \
-# RUN: -riscv-postmisched-load-store-clustering=false -debug-only=machine-scheduler \
+# RUN: -mattr=+disable-postmisched-load-clustering \
+# RUN: -mattr=+disable-postmisched-store-clustering -debug-only=machine-scheduler \
# RUN: -start-before=machine-scheduler -stop-after=postmisched -misched-regpressure=false -o - 2>&1 < %s \
# RUN: | FileCheck -check-prefix=NOPOSTMISCHED %s
# RUN: llc -mtriple=riscv64 -x mir -mcpu=sifive-p470 -mattr=+use-postra-scheduler -verify-misched -enable-post-misched=true \
-# RUN: -riscv-postmisched-load-store-clustering=false -debug-only=machine-scheduler \
+# RUN: -mattr=+disable-postmisched-load-clustering \
+# RUN: -mattr=+disable-postmisched-store-clustering -debug-only=machine-scheduler \
# RUN: -start-before=machine-scheduler -stop-after=postmisched -misched-regpressure=false -o - 2>&1 < %s \
# RUN: | FileCheck -check-prefix=NOCLUSTER %s
# RUN: llc -mtriple=riscv64 -x mir -mcpu=sifive-p470 -mattr=+use-postra-scheduler -verify-misched -enable-post-misched=true \
diff --git a/llvm/test/CodeGen/RISCV/misched-store-clustering.ll b/llvm/test/CodeGen/RISCV/misched-store-clustering.ll
new file mode 100644
index 0000000..02e853d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/misched-store-clustering.ll
@@ -0,0 +1,83 @@
+; REQUIRES: asserts
+;
+; Disable all misched clustering
+; RUN: llc -mtriple=riscv32 -verify-misched \
+; RUN: -mattr=+disable-misched-load-clustering,+disable-misched-store-clustering \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=NOCLUSTER %s
+; RUN: llc -mtriple=riscv64 -verify-misched \
+; RUN: -mattr=+disable-misched-load-clustering,+disable-misched-store-clustering \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=NOCLUSTER %s
+;
+; ST misched clustering only
+; RUN: llc -mtriple=riscv32 -verify-misched \
+; RUN: -mattr=+disable-misched-load-clustering \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=STCLUSTER %s
+; RUN: llc -mtriple=riscv64 -verify-misched \
+; RUN: -mattr=+disable-misched-load-clustering \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=STCLUSTER %s
+;
+; LD misched clustering only
+; RUN: llc -mtriple=riscv32 -verify-misched \
+; RUN: -mattr=+disable-misched-store-clustering \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=LDCLUSTER %s
+; RUN: llc -mtriple=riscv64 -verify-misched \
+; RUN: -mattr=+disable-misched-store-clustering \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=LDCLUSTER %s
+;
+; Default misched cluster settings (i.e. both LD and ST clustering)
+; RUN: llc -mtriple=riscv32 -verify-misched \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=DEFAULTCLUSTER %s
+; RUN: llc -mtriple=riscv64 -verify-misched \
+; RUN: -debug-only=machine-scheduler -o - 2>&1 < %s \
+; RUN: | FileCheck -check-prefix=DEFAULTCLUSTER %s
+
+define i32 @store_clustering_1(ptr nocapture %p, i32 %v) {
+; NOCLUSTER: ********** MI Scheduling **********
+; NOCLUSTER-LABEL: store_clustering_1:%bb.0
+; NOCLUSTER: *** Final schedule for %bb.0 ***
+; NOCLUSTER: SU(2): SW %1:gpr, %0:gpr, 12 :: (store (s32) into %ir.arrayidx0)
+; NOCLUSTER: SU(3): SW %1:gpr, %0:gpr, 8 :: (store (s32) into %ir.arrayidx1)
+; NOCLUSTER: SU(4): SW %1:gpr, %0:gpr, 4 :: (store (s32) into %ir.arrayidx2)
+; NOCLUSTER: SU(5): SW %1:gpr, %0:gpr, 16 :: (store (s32) into %ir.arrayidx3)
+;
+; STCLUSTER: ********** MI Scheduling **********
+; STCLUSTER-LABEL: store_clustering_1:%bb.0
+; STCLUSTER: *** Final schedule for %bb.0 ***
+; STCLUSTER: SU(4): SW %1:gpr, %0:gpr, 4 :: (store (s32) into %ir.arrayidx2)
+; STCLUSTER: SU(3): SW %1:gpr, %0:gpr, 8 :: (store (s32) into %ir.arrayidx1)
+; STCLUSTER: SU(2): SW %1:gpr, %0:gpr, 12 :: (store (s32) into %ir.arrayidx0)
+; STCLUSTER: SU(5): SW %1:gpr, %0:gpr, 16 :: (store (s32) into %ir.arrayidx3)
+;
+; LDCLUSTER: ********** MI Scheduling **********
+; LDCLUSTER-LABEL: store_clustering_1:%bb.0
+; LDCLUSTER: *** Final schedule for %bb.0 ***
+; LDCLUSTER: SU(2): SW %1:gpr, %0:gpr, 12 :: (store (s32) into %ir.arrayidx0)
+; LDCLUSTER: SU(3): SW %1:gpr, %0:gpr, 8 :: (store (s32) into %ir.arrayidx1)
+; LDCLUSTER: SU(4): SW %1:gpr, %0:gpr, 4 :: (store (s32) into %ir.arrayidx2)
+; LDCLUSTER: SU(5): SW %1:gpr, %0:gpr, 16 :: (store (s32) into %ir.arrayidx3)
+;
+; DEFAULTCLUSTER: ********** MI Scheduling **********
+; DEFAULTCLUSTER-LABEL: store_clustering_1:%bb.0
+; DEFAULTCLUSTER: *** Final schedule for %bb.0 ***
+; DEFAULTCLUSTER: SU(4): SW %1:gpr, %0:gpr, 4 :: (store (s32) into %ir.arrayidx2)
+; DEFAULTCLUSTER: SU(3): SW %1:gpr, %0:gpr, 8 :: (store (s32) into %ir.arrayidx1)
+; DEFAULTCLUSTER: SU(2): SW %1:gpr, %0:gpr, 12 :: (store (s32) into %ir.arrayidx0)
+; DEFAULTCLUSTER: SU(5): SW %1:gpr, %0:gpr, 16 :: (store (s32) into %ir.arrayidx3)
+entry:
+ %arrayidx0 = getelementptr inbounds i32, ptr %p, i32 3
+ store i32 %v, ptr %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, ptr %p, i32 2
+ store i32 %v, ptr %arrayidx1
+ %arrayidx2 = getelementptr inbounds i32, ptr %p, i32 1
+ store i32 %v, ptr %arrayidx2
+ %arrayidx3 = getelementptr inbounds i32, ptr %p, i32 4
+ store i32 %v, ptr %arrayidx3
+ ret i32 %v
+}
diff --git a/llvm/test/CodeGen/RISCV/note-gnu-property-zicfiss.ll b/llvm/test/CodeGen/RISCV/note-gnu-property-zicfiss.ll
index 24d63cb..efc4439 100644
--- a/llvm/test/CodeGen/RISCV/note-gnu-property-zicfiss.ll
+++ b/llvm/test/CodeGen/RISCV/note-gnu-property-zicfiss.ll
@@ -7,19 +7,18 @@

; ASM: .section ".note.GNU-stack","",@progbits
; ASM-NEXT: .section .note.gnu.property,"a",@note
+; ASM32-NEXT: .p2align 2, 0x0
+; ASM64-NEXT: .p2align 3, 0x0
; ASM-NEXT: .word 4
-; ASM-NEXT: .word .Ltmp1-.Ltmp0
+; ASM32-NEXT: .word 12
+; ASM64-NEXT: .word 16
; ASM-NEXT: .word 5
; ASM-NEXT: .asciz "GNU"
-; ASM-NEXT: .Ltmp0:
-; ASM32-NEXT: .p2align 2, 0x0
-; ASM64-NEXT: .p2align 3, 0x0
; ASM-NEXT: .word 3221225472
; ASM-NEXT: .word 4
; ASM-NEXT: .word 2
; ASM32-NEXT: .p2align 2, 0x0
; ASM64-NEXT: .p2align 3, 0x0
-; ASM-NEXT: .Ltmp1:

define i32 @f() "hw-shadow-stack" {
entry:
diff --git a/llvm/test/CodeGen/RISCV/rnmi-interrupt-attr-error.ll b/llvm/test/CodeGen/RISCV/rnmi-interrupt-attr-error.ll
new file mode 100644
index 0000000..19cc994
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rnmi-interrupt-attr-error.ll
@@ -0,0 +1,9 @@
+; RUN: not llc -mtriple riscv32-unknown-elf -mattr=-smrnmi -o - %s 2>&1 \
+; RUN: | FileCheck %s
+; RUN: not llc -mtriple riscv64-unknown-elf -mattr=-smrnmi -o - %s 2>&1 \
+; RUN: | FileCheck %s
+
+; CHECK: LLVM ERROR: 'rnmi' interrupt kind requires Srnmi extension
+define void @test_rnmi() "interrupt"="rnmi" {
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rnmi-interrupt-attr.ll b/llvm/test/CodeGen/RISCV/rnmi-interrupt-attr.ll
new file mode 100644
index 0000000..03236a0
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rnmi-interrupt-attr.ll
@@ -0,0 +1,373 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple riscv32-unknown-elf -mattr=+smrnmi -o - %s \
+; RUN: -verify-machineinstrs | FileCheck --check-prefix=RNMI-RV32 %s
+
+; RUN: llc -mtriple riscv32-unknown-elf -mattr=+smrnmi -o - %s \
+; RUN: -verify-machineinstrs -frame-pointer=all | FileCheck --check-prefix=RNMI-RV32-FP %s
+
+; RUN: llc -mtriple riscv64-unknown-elf -mattr=+smrnmi -o - %s \
+; RUN: -verify-machineinstrs | FileCheck --check-prefix=RNMI-RV64 %s
+
+; RUN: llc -mtriple riscv64-unknown-elf -mattr=+smrnmi -o - %s \
+; RUN: -verify-machineinstrs -frame-pointer=all | FileCheck --check-prefix=RNMI-RV64-FP %s
+
+define void @test_rnmi_empty() "interrupt"="rnmi" {
+; RNMI-RV32-LABEL: test_rnmi_empty:
+; RNMI-RV32: # %bb.0:
+; RNMI-RV32-NEXT: mnret
+;
+; RNMI-RV32-FP-LABEL: test_rnmi_empty:
+; RNMI-RV32-FP: # %bb.0:
+; RNMI-RV32-FP-NEXT: addi sp, sp, -16
+; RNMI-RV32-FP-NEXT: .cfi_def_cfa_offset 16
+; RNMI-RV32-FP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: .cfi_offset ra, -4
+; RNMI-RV32-FP-NEXT: .cfi_offset s0, -8
+; RNMI-RV32-FP-NEXT: addi s0, sp, 16
+; RNMI-RV32-FP-NEXT: .cfi_def_cfa s0, 0
+; RNMI-RV32-FP-NEXT: .cfi_def_cfa sp, 16
+; RNMI-RV32-FP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: .cfi_restore ra
+; RNMI-RV32-FP-NEXT: .cfi_restore s0
+; RNMI-RV32-FP-NEXT: addi sp, sp, 16
+; RNMI-RV32-FP-NEXT: .cfi_def_cfa_offset 0
+; RNMI-RV32-FP-NEXT: mnret
+;
+; RNMI-RV64-LABEL: test_rnmi_empty:
+; RNMI-RV64: # %bb.0:
+; RNMI-RV64-NEXT: mnret
+;
+; RNMI-RV64-FP-LABEL: test_rnmi_empty:
+; RNMI-RV64-FP: # %bb.0:
+; RNMI-RV64-FP-NEXT: addi sp, sp, -16
+; RNMI-RV64-FP-NEXT: .cfi_def_cfa_offset 16
+; RNMI-RV64-FP-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: .cfi_offset ra, -8
+; RNMI-RV64-FP-NEXT: .cfi_offset s0, -16
+; RNMI-RV64-FP-NEXT: addi s0, sp, 16
+; RNMI-RV64-FP-NEXT: .cfi_def_cfa s0, 0
+; RNMI-RV64-FP-NEXT: .cfi_def_cfa sp, 16
+; RNMI-RV64-FP-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: .cfi_restore ra
+; RNMI-RV64-FP-NEXT: .cfi_restore s0
+; RNMI-RV64-FP-NEXT: addi sp, sp, 16
+; RNMI-RV64-FP-NEXT: .cfi_def_cfa_offset 0
+; RNMI-RV64-FP-NEXT: mnret
+ ret void
+}
+
+declare void @callee()
+
+define void @test_rnmi_caller() "interrupt"="rnmi" {
+; RNMI-RV32-LABEL: test_rnmi_caller:
+; RNMI-RV32: # %bb.0:
+; RNMI-RV32-NEXT: addi sp, sp, -64
+; RNMI-RV32-NEXT: .cfi_def_cfa_offset 64
+; RNMI-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw t0, 56(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw t1, 52(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw t2, 48(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw a0, 44(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw a1, 40(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw a2, 36(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw a3, 32(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw a4, 28(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw a5, 24(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw a6, 20(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw a7, 16(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw t3, 12(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw t4, 8(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw t5, 4(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: sw t6, 0(sp) # 4-byte Folded Spill
+; RNMI-RV32-NEXT: .cfi_offset ra, -4
+; RNMI-RV32-NEXT: .cfi_offset t0, -8
+; RNMI-RV32-NEXT: .cfi_offset t1, -12
+; RNMI-RV32-NEXT: .cfi_offset t2, -16
+; RNMI-RV32-NEXT: .cfi_offset a0, -20
+; RNMI-RV32-NEXT: .cfi_offset a1, -24
+; RNMI-RV32-NEXT: .cfi_offset a2, -28
+; RNMI-RV32-NEXT: .cfi_offset a3, -32
+; RNMI-RV32-NEXT: .cfi_offset a4, -36
+; RNMI-RV32-NEXT: .cfi_offset a5, -40
+; RNMI-RV32-NEXT: .cfi_offset a6, -44
+; RNMI-RV32-NEXT: .cfi_offset a7, -48
+; RNMI-RV32-NEXT: .cfi_offset t3, -52
+; RNMI-RV32-NEXT: .cfi_offset t4, -56
+; RNMI-RV32-NEXT: .cfi_offset t5, -60
+; RNMI-RV32-NEXT: .cfi_offset t6, -64
+; RNMI-RV32-NEXT: call callee
+; RNMI-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw t0, 56(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw t1, 52(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw t2, 48(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw a0, 44(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw a1, 40(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw a2, 36(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw a4, 28(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw a7, 16(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw t3, 12(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw t4, 8(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw t5, 4(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: lw t6, 0(sp) # 4-byte Folded Reload
+; RNMI-RV32-NEXT: .cfi_restore ra
+; RNMI-RV32-NEXT: .cfi_restore t0
+; RNMI-RV32-NEXT: .cfi_restore t1
+; RNMI-RV32-NEXT: .cfi_restore t2
+; RNMI-RV32-NEXT: .cfi_restore a0
+; RNMI-RV32-NEXT: .cfi_restore a1
+; RNMI-RV32-NEXT: .cfi_restore a2
+; RNMI-RV32-NEXT: .cfi_restore a3
+; RNMI-RV32-NEXT: .cfi_restore a4
+; RNMI-RV32-NEXT: .cfi_restore a5
+; RNMI-RV32-NEXT: .cfi_restore a6
+; RNMI-RV32-NEXT: .cfi_restore a7
+; RNMI-RV32-NEXT: .cfi_restore t3
+; RNMI-RV32-NEXT: .cfi_restore t4
+; RNMI-RV32-NEXT: .cfi_restore t5
+; RNMI-RV32-NEXT: .cfi_restore t6
+; RNMI-RV32-NEXT: addi sp, sp, 64
+; RNMI-RV32-NEXT: .cfi_def_cfa_offset 0
+; RNMI-RV32-NEXT: mnret
+;
+; RNMI-RV32-FP-LABEL: test_rnmi_caller:
+; RNMI-RV32-FP: # %bb.0:
+; RNMI-RV32-FP-NEXT: addi sp, sp, -80
+; RNMI-RV32-FP-NEXT: .cfi_def_cfa_offset 80
+; RNMI-RV32-FP-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw t0, 72(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw t1, 68(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw t2, 64(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw s0, 60(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw a0, 56(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw a1, 52(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw a2, 48(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw a3, 44(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw a4, 40(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw a5, 36(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw a6, 32(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw a7, 28(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw t3, 24(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw t4, 20(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw t5, 16(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: sw t6, 12(sp) # 4-byte Folded Spill
+; RNMI-RV32-FP-NEXT: .cfi_offset ra, -4
+; RNMI-RV32-FP-NEXT: .cfi_offset t0, -8
+; RNMI-RV32-FP-NEXT: .cfi_offset t1, -12
+; RNMI-RV32-FP-NEXT: .cfi_offset t2, -16
+; RNMI-RV32-FP-NEXT: .cfi_offset s0, -20
+; RNMI-RV32-FP-NEXT: .cfi_offset a0, -24
+; RNMI-RV32-FP-NEXT: .cfi_offset a1, -28
+; RNMI-RV32-FP-NEXT: .cfi_offset a2, -32
+; RNMI-RV32-FP-NEXT: .cfi_offset a3, -36
+; RNMI-RV32-FP-NEXT: .cfi_offset a4, -40
+; RNMI-RV32-FP-NEXT: .cfi_offset a5, -44
+; RNMI-RV32-FP-NEXT: .cfi_offset a6, -48
+; RNMI-RV32-FP-NEXT: .cfi_offset a7, -52
+; RNMI-RV32-FP-NEXT: .cfi_offset t3, -56
+; RNMI-RV32-FP-NEXT: .cfi_offset t4, -60
+; RNMI-RV32-FP-NEXT: .cfi_offset t5, -64
+; RNMI-RV32-FP-NEXT: .cfi_offset t6, -68
+; RNMI-RV32-FP-NEXT: addi s0, sp, 80
+; RNMI-RV32-FP-NEXT: .cfi_def_cfa s0, 0
+; RNMI-RV32-FP-NEXT: call callee
+; RNMI-RV32-FP-NEXT: .cfi_def_cfa sp, 80
+; RNMI-RV32-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw t0, 72(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw t1, 68(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw t2, 64(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw s0, 60(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw a0, 56(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw a1, 52(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw a2, 48(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw a3, 44(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw a4, 40(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw a5, 36(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw a6, 32(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw a7, 28(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw t3, 24(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw t4, 20(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw t5, 16(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: lw t6, 12(sp) # 4-byte Folded Reload
+; RNMI-RV32-FP-NEXT: .cfi_restore ra
+; RNMI-RV32-FP-NEXT: .cfi_restore t0
+; RNMI-RV32-FP-NEXT: .cfi_restore t1
+; RNMI-RV32-FP-NEXT: .cfi_restore t2
+; RNMI-RV32-FP-NEXT: .cfi_restore s0
+; RNMI-RV32-FP-NEXT: .cfi_restore a0
+; RNMI-RV32-FP-NEXT: .cfi_restore a1
+; RNMI-RV32-FP-NEXT: .cfi_restore a2
+; RNMI-RV32-FP-NEXT: .cfi_restore a3
+; RNMI-RV32-FP-NEXT: .cfi_restore a4
+; RNMI-RV32-FP-NEXT: .cfi_restore a5
+; RNMI-RV32-FP-NEXT: .cfi_restore a6
+; RNMI-RV32-FP-NEXT: .cfi_restore a7
+; RNMI-RV32-FP-NEXT: .cfi_restore t3
+; RNMI-RV32-FP-NEXT: .cfi_restore t4
+; RNMI-RV32-FP-NEXT: .cfi_restore t5
+; RNMI-RV32-FP-NEXT: .cfi_restore t6
+; RNMI-RV32-FP-NEXT: addi sp, sp, 80
+; RNMI-RV32-FP-NEXT: .cfi_def_cfa_offset 0
+; RNMI-RV32-FP-NEXT: mnret
+;
+; RNMI-RV64-LABEL: test_rnmi_caller:
+; RNMI-RV64: # %bb.0:
+; RNMI-RV64-NEXT: addi sp, sp, -128
+; RNMI-RV64-NEXT: .cfi_def_cfa_offset 128
+; RNMI-RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd t0, 112(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd t1, 104(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd t2, 96(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd a0, 88(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd a1, 80(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd a2, 72(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd a3, 64(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd a4, 56(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd a5, 48(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd a6, 40(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd a7, 32(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd t3, 24(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd t4, 16(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd t5, 8(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: sd t6, 0(sp) # 8-byte Folded Spill
+; RNMI-RV64-NEXT: .cfi_offset ra, -8
+; RNMI-RV64-NEXT: .cfi_offset t0, -16
+; RNMI-RV64-NEXT: .cfi_offset t1, -24
+; RNMI-RV64-NEXT: .cfi_offset t2, -32
+; RNMI-RV64-NEXT: .cfi_offset a0, -40
+; RNMI-RV64-NEXT: .cfi_offset a1, -48
+; RNMI-RV64-NEXT: .cfi_offset a2, -56
+; RNMI-RV64-NEXT: .cfi_offset a3, -64
+; RNMI-RV64-NEXT: .cfi_offset a4, -72
+; RNMI-RV64-NEXT: .cfi_offset a5, -80
+; RNMI-RV64-NEXT: .cfi_offset a6, -88
+; RNMI-RV64-NEXT: .cfi_offset a7, -96
+; RNMI-RV64-NEXT: .cfi_offset t3, -104
+; RNMI-RV64-NEXT: .cfi_offset t4, -112
+; RNMI-RV64-NEXT: .cfi_offset t5, -120
+; RNMI-RV64-NEXT: .cfi_offset t6, -128
+; RNMI-RV64-NEXT: call callee
+; RNMI-RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld t0, 112(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld t1, 104(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld t2, 96(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld a0, 88(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld a1, 80(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld a2, 72(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld a3, 64(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld a4, 56(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld a5, 48(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld a6, 40(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld a7, 32(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld t3, 24(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld t4, 16(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld t5, 8(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: ld t6, 0(sp) # 8-byte Folded Reload
+; RNMI-RV64-NEXT: .cfi_restore ra
+; RNMI-RV64-NEXT: .cfi_restore t0
+; RNMI-RV64-NEXT: .cfi_restore t1
+; RNMI-RV64-NEXT: .cfi_restore t2
+; RNMI-RV64-NEXT: .cfi_restore a0
+; RNMI-RV64-NEXT: .cfi_restore a1
+; RNMI-RV64-NEXT: .cfi_restore a2
+; RNMI-RV64-NEXT: .cfi_restore a3
+; RNMI-RV64-NEXT: .cfi_restore a4
+; RNMI-RV64-NEXT: .cfi_restore a5
+; RNMI-RV64-NEXT: .cfi_restore a6
+; RNMI-RV64-NEXT: .cfi_restore a7
+; RNMI-RV64-NEXT: .cfi_restore t3
+; RNMI-RV64-NEXT: .cfi_restore t4
+; RNMI-RV64-NEXT: .cfi_restore t5
+; RNMI-RV64-NEXT: .cfi_restore t6
+; RNMI-RV64-NEXT: addi sp, sp, 128
+; RNMI-RV64-NEXT: .cfi_def_cfa_offset 0
+; RNMI-RV64-NEXT: mnret
+;
+; RNMI-RV64-FP-LABEL: test_rnmi_caller:
+; RNMI-RV64-FP: # %bb.0:
+; RNMI-RV64-FP-NEXT: addi sp, sp, -144
+; RNMI-RV64-FP-NEXT: .cfi_def_cfa_offset 144
+; RNMI-RV64-FP-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd t0, 128(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd t1, 120(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd t2, 112(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd s0, 104(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd a0, 96(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd a1, 88(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd a2, 80(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd a3, 72(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd a4, 64(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd a5, 56(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd a6, 48(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd a7, 40(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd t3, 32(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd t4, 24(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd t5, 16(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: sd t6, 8(sp) # 8-byte Folded Spill
+; RNMI-RV64-FP-NEXT: .cfi_offset ra, -8
+; RNMI-RV64-FP-NEXT: .cfi_offset t0, -16
+; RNMI-RV64-FP-NEXT: .cfi_offset t1, -24
+; RNMI-RV64-FP-NEXT: .cfi_offset t2, -32
+; RNMI-RV64-FP-NEXT: .cfi_offset s0, -40
+; RNMI-RV64-FP-NEXT: .cfi_offset a0, -48
+; RNMI-RV64-FP-NEXT: .cfi_offset a1, -56
+; RNMI-RV64-FP-NEXT: .cfi_offset a2, -64
+; RNMI-RV64-FP-NEXT: .cfi_offset a3, -72
+; RNMI-RV64-FP-NEXT: .cfi_offset a4, -80
+; RNMI-RV64-FP-NEXT: .cfi_offset a5, -88
+; RNMI-RV64-FP-NEXT: .cfi_offset a6, -96
+; RNMI-RV64-FP-NEXT: .cfi_offset a7, -104
+; RNMI-RV64-FP-NEXT: .cfi_offset t3, -112
+; RNMI-RV64-FP-NEXT: .cfi_offset t4, -120
+; RNMI-RV64-FP-NEXT: .cfi_offset t5, -128
+; RNMI-RV64-FP-NEXT: .cfi_offset t6, -136
+; RNMI-RV64-FP-NEXT: addi s0, sp, 144
+; RNMI-RV64-FP-NEXT: .cfi_def_cfa s0, 0
+; RNMI-RV64-FP-NEXT: call callee
+; RNMI-RV64-FP-NEXT: .cfi_def_cfa sp, 144
+; RNMI-RV64-FP-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld t0, 128(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld t1, 120(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld t2, 112(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld s0, 104(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld a0, 96(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld a1, 88(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld a2, 80(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld a3, 72(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld a4, 64(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld a5, 56(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld a6, 48(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld a7, 40(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld t3, 32(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld t4, 24(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld t5, 16(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: ld t6, 8(sp) # 8-byte Folded Reload
+; RNMI-RV64-FP-NEXT: .cfi_restore ra
+; RNMI-RV64-FP-NEXT: .cfi_restore t0
+; RNMI-RV64-FP-NEXT: .cfi_restore t1
+; RNMI-RV64-FP-NEXT: .cfi_restore t2
+; RNMI-RV64-FP-NEXT: .cfi_restore s0
+; RNMI-RV64-FP-NEXT: .cfi_restore a0
+; RNMI-RV64-FP-NEXT: .cfi_restore a1
+; RNMI-RV64-FP-NEXT: .cfi_restore a2
+; RNMI-RV64-FP-NEXT: .cfi_restore a3
+; RNMI-RV64-FP-NEXT: .cfi_restore a4
+; RNMI-RV64-FP-NEXT: .cfi_restore a5
+; RNMI-RV64-FP-NEXT: .cfi_restore a6
+; RNMI-RV64-FP-NEXT: .cfi_restore a7
+; RNMI-RV64-FP-NEXT: .cfi_restore t3
+; RNMI-RV64-FP-NEXT: .cfi_restore t4
+; RNMI-RV64-FP-NEXT: .cfi_restore t5
+; RNMI-RV64-FP-NEXT: .cfi_restore t6
+; RNMI-RV64-FP-NEXT: addi sp, sp, 144
+; RNMI-RV64-FP-NEXT: .cfi_def_cfa_offset 0
+; RNMI-RV64-FP-NEXT: mnret
+ call void @callee()
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 8dd6301..eb8b769 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -1587,59 +1587,59 @@ define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
; CHECK-LABEL: sub_if_uge_i128:
; CHECK: # %bb.0:
-; CHECK-NEXT: lw a7, 4(a2)
-; CHECK-NEXT: lw a6, 8(a2)
-; CHECK-NEXT: lw t0, 12(a2)
; CHECK-NEXT: lw a3, 4(a1)
-; CHECK-NEXT: lw a4, 12(a1)
-; CHECK-NEXT: lw a5, 8(a1)
-; CHECK-NEXT: beq a4, t0, .LBB53_2
+; CHECK-NEXT: lw a4, 8(a1)
+; CHECK-NEXT: lw a5, 12(a1)
+; CHECK-NEXT: lw a6, 4(a2)
+; CHECK-NEXT: lw t0, 12(a2)
+; CHECK-NEXT: lw a7, 8(a2)
+; CHECK-NEXT: beq a5, t0, .LBB53_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: sltu t1, a4, t0
+; CHECK-NEXT: sltu t1, a5, t0
; CHECK-NEXT: j .LBB53_3
; CHECK-NEXT: .LBB53_2:
-; CHECK-NEXT: sltu t1, a5, a6
+; CHECK-NEXT: sltu t1, a4, a7
; CHECK-NEXT: .LBB53_3:
-; CHECK-NEXT: lw a2, 0(a2)
; CHECK-NEXT: lw a1, 0(a1)
-; CHECK-NEXT: beq a3, a7, .LBB53_5
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: beq a3, a6, .LBB53_5
; CHECK-NEXT: # %bb.4:
-; CHECK-NEXT: sltu t2, a3, a7
+; CHECK-NEXT: sltu t2, a3, a6
; CHECK-NEXT: j .LBB53_6
; CHECK-NEXT: .LBB53_5:
; CHECK-NEXT: sltu t2, a1, a2
; CHECK-NEXT: .LBB53_6:
-; CHECK-NEXT: xor t3, a4, t0
-; CHECK-NEXT: xor t4, a5, a6
+; CHECK-NEXT: xor t3, a5, t0
+; CHECK-NEXT: xor t4, a4, a7
; CHECK-NEXT: or t3, t4, t3
; CHECK-NEXT: beqz t3, .LBB53_8
; CHECK-NEXT: # %bb.7:
; CHECK-NEXT: mv t2, t1
; CHECK-NEXT: .LBB53_8:
-; CHECK-NEXT: addi t2, t2, -1
-; CHECK-NEXT: and t1, t2, t0
-; CHECK-NEXT: and t0, t2, a2
-; CHECK-NEXT: and a7, t2, a7
+; CHECK-NEXT: addi t3, t2, -1
+; CHECK-NEXT: and t2, t3, t0
+; CHECK-NEXT: and t0, t3, a2
+; CHECK-NEXT: and t1, t3, a6
; CHECK-NEXT: sltu a2, a1, t0
-; CHECK-NEXT: and t2, t2, a6
+; CHECK-NEXT: and a7, t3, a7
; CHECK-NEXT: mv a6, a2
-; CHECK-NEXT: beq a3, a7, .LBB53_10
+; CHECK-NEXT: beq a3, t1, .LBB53_10
; CHECK-NEXT: # %bb.9:
-; CHECK-NEXT: sltu a6, a3, a7
+; CHECK-NEXT: sltu a6, a3, t1
; CHECK-NEXT: .LBB53_10:
-; CHECK-NEXT: sub t3, a5, t2
-; CHECK-NEXT: sltu a5, a5, t2
-; CHECK-NEXT: sub a4, a4, t1
-; CHECK-NEXT: sub a3, a3, a7
+; CHECK-NEXT: sub t3, a4, a7
+; CHECK-NEXT: sltu a4, a4, a7
+; CHECK-NEXT: sub a5, a5, t2
+; CHECK-NEXT: sub a3, a3, t1
; CHECK-NEXT: sub a1, a1, t0
; CHECK-NEXT: sltu a7, t3, a6
-; CHECK-NEXT: sub a4, a4, a5
-; CHECK-NEXT: sub a5, t3, a6
+; CHECK-NEXT: sub a5, a5, a4
+; CHECK-NEXT: sub a4, t3, a6
; CHECK-NEXT: sub a3, a3, a2
-; CHECK-NEXT: sub a2, a4, a7
+; CHECK-NEXT: sub a2, a5, a7
; CHECK-NEXT: sw a1, 0(a0)
; CHECK-NEXT: sw a3, 4(a0)
-; CHECK-NEXT: sw a5, 8(a0)
+; CHECK-NEXT: sw a4, 8(a0)
; CHECK-NEXT: sw a2, 12(a0)
; CHECK-NEXT: ret
%cmp = icmp ult i128 %x, %y
diff --git a/llvm/test/CodeGen/RISCV/rv32zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbkb.ll
index 4aa6dd4..42d326e 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbkb.ll
@@ -319,3 +319,142 @@ define i64 @zext_i16_to_i64(i16 %a) nounwind {
%1 = zext i16 %a to i64
ret i64 %1
}
+
+define i32 @pack_lo_packh_hi_packh(i8 zeroext %0, i8 zeroext %1, i8 zeroext %2, i8 zeroext %3) nounwind {
+; RV32I-LABEL: pack_lo_packh_hi_packh:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: slli a2, a2, 16
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: or a2, a2, a3
+; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: ret
+;
+; RV32ZBKB-LABEL: pack_lo_packh_hi_packh:
+; RV32ZBKB: # %bb.0:
+; RV32ZBKB-NEXT: packh a0, a0, a1
+; RV32ZBKB-NEXT: packh a1, a2, a3
+; RV32ZBKB-NEXT: pack a0, a0, a1
+; RV32ZBKB-NEXT: ret
+ %a = zext i8 %0 to i32
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = zext i8 %3 to i32
+ %e = shl i32 %b, 8
+ %f = shl i32 %c, 16
+ %g = shl i32 %d, 24
+ %h = or i32 %a, %e
+ %i = or i32 %h, %f
+ %j = or i32 %i, %g
+ ret i32 %j
+}
+
+define i32 @pack_lo_packh_hi_packh_2(i8 %0, i8 %1, i8 %2, i8 %3) nounwind {
+; RV32I-LABEL: pack_lo_packh_hi_packh_2:
+; RV32I: # %bb.0:
+; RV32I-NEXT: zext.b a0, a0
+; RV32I-NEXT: zext.b a1, a1
+; RV32I-NEXT: zext.b a2, a2
+; RV32I-NEXT: slli a3, a3, 24
+; RV32I-NEXT: slli a1, a1, 8
+; RV32I-NEXT: slli a2, a2, 16
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: or a2, a2, a3
+; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: ret
+;
+; RV32ZBKB-LABEL: pack_lo_packh_hi_packh_2:
+; RV32ZBKB: # %bb.0:
+; RV32ZBKB-NEXT: packh a0, a0, a1
+; RV32ZBKB-NEXT: packh a1, a2, a3
+; RV32ZBKB-NEXT: pack a0, a0, a1
+; RV32ZBKB-NEXT: ret
+ %a = zext i8 %0 to i32
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = zext i8 %3 to i32
+ %e = shl i32 %b, 8
+ %f = shl i32 %c, 16
+ %g = shl i32 %d, 24
+ %h = or i32 %a, %e
+ %i = or i32 %h, %f
+ %j = or i32 %i, %g
+ ret i32 %j
+}
+
+define i32 @pack_lo_zext_hi_packh(i16 zeroext %0, i8 zeroext %1, i8 zeroext %2) nounwind {
+; RV32I-LABEL: pack_lo_zext_hi_packh:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: slli a2, a2, 24
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBKB-LABEL: pack_lo_zext_hi_packh:
+; RV32ZBKB: # %bb.0:
+; RV32ZBKB-NEXT: packh a1, a1, a2
+; RV32ZBKB-NEXT: pack a0, a0, a1
+; RV32ZBKB-NEXT: ret
+ %a = zext i16 %0 to i32
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = shl i32 %c, 8
+ %e = or i32 %b, %d
+ %f = shl i32 %e, 16
+ %g = or i32 %f, %a
+ ret i32 %g
+}
+
+; Negative test: %a isn't extended, so we can't use pack for the outer or, but
+; we can use packh for the high half (see the pack/packh sketch after this file's diff).
+define i32 @pack_lo_noext_hi_packh(i32 %a, i8 zeroext %1, i8 zeroext %2) nounwind {
+; RV32I-LABEL: pack_lo_noext_hi_packh:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: slli a2, a2, 24
+; RV32I-NEXT: or a1, a2, a1
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBKB-LABEL: pack_lo_noext_hi_packh:
+; RV32ZBKB: # %bb.0:
+; RV32ZBKB-NEXT: packh a1, a1, a2
+; RV32ZBKB-NEXT: slli a1, a1, 16
+; RV32ZBKB-NEXT: or a0, a1, a0
+; RV32ZBKB-NEXT: ret
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = shl i32 %c, 8
+ %e = or i32 %b, %d
+ %f = shl i32 %e, 16
+ %g = or i32 %f, %a
+ ret i32 %g
+}
+
+; Make sure we can match packh+slli without having the input bytes zero-extended.
+define i32 @pack_lo_noext_hi_packh_nozeroext(i32 %a, i8 %1, i8 %2) nounwind {
+; RV32I-LABEL: pack_lo_noext_hi_packh_nozeroext:
+; RV32I: # %bb.0:
+; RV32I-NEXT: zext.b a1, a1
+; RV32I-NEXT: slli a2, a2, 24
+; RV32I-NEXT: slli a1, a1, 16
+; RV32I-NEXT: or a0, a2, a0
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV32ZBKB-LABEL: pack_lo_noext_hi_packh_nozeroext:
+; RV32ZBKB: # %bb.0:
+; RV32ZBKB-NEXT: packh a1, a1, a2
+; RV32ZBKB-NEXT: slli a1, a1, 16
+; RV32ZBKB-NEXT: or a0, a1, a0
+; RV32ZBKB-NEXT: ret
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = shl i32 %c, 8
+ %e = or i32 %b, %d
+ %f = shl i32 %e, 16
+ %g = or i32 %f, %a
+ ret i32 %g
+}
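A minimal C model of the Zbkb packing semantics these RV32 tests rely on, assuming the
standard Zbkb definitions of packh and pack (the helper names below are illustrative and
not part of the patch):

#include <stdint.h>

/* packh: pack the low byte of each source into the low 16 bits, zero-extended. */
static uint32_t packh32(uint32_t rs1, uint32_t rs2) {
    return (rs1 & 0xff) | ((rs2 & 0xff) << 8);
}

/* pack (RV32): pack the low halfword of each source into one 32-bit word. */
static uint32_t pack32(uint32_t rs1, uint32_t rs2) {
    return (rs1 & 0xffff) | ((rs2 & 0xffff) << 16);
}

/* pack_lo_packh_hi_packh computes b0 | (b1 << 8) | (b2 << 16) | (b3 << 24),
   i.e. two packh results combined by one pack. */
static uint32_t pack_four_bytes(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3) {
    return pack32(packh32(b0, b1), packh32(b2, b3));
}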
diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll
index 1a3beeb7..e3728bf 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll
@@ -785,16 +785,16 @@ define i32 @bset_trailing_ones_i32_no_mask(i32 %a) nounwind {
define i64 @bset_trailing_ones_i64_mask(i64 %a) nounwind {
; CHECK-LABEL: bset_trailing_ones_i64_mask:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a2, -1
-; CHECK-NEXT: andi a3, a0, 63
-; CHECK-NEXT: addi a1, a3, -32
-; CHECK-NEXT: sll a0, a2, a0
+; CHECK-NEXT: andi a2, a0, 63
+; CHECK-NEXT: li a3, -1
+; CHECK-NEXT: addi a1, a2, -32
+; CHECK-NEXT: sll a0, a3, a0
; CHECK-NEXT: bltz a1, .LBB43_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: sll a2, a2, a3
+; CHECK-NEXT: sll a2, a3, a2
; CHECK-NEXT: j .LBB43_3
; CHECK-NEXT: .LBB43_2:
-; CHECK-NEXT: not a2, a3
+; CHECK-NEXT: not a2, a2
; CHECK-NEXT: lui a3, 524288
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: srl a2, a3, a2
diff --git a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
index 57061e1..f89d1abf 100644
--- a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
@@ -253,8 +253,8 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind {
; RV64IZHINX-NEXT: srli a1, a2, 1
; RV64IZHINX-NEXT: .LBB4_4:
; RV64IZHINX-NEXT: feq.s a2, s0, s0
-; RV64IZHINX-NEXT: neg a3, a3
; RV64IZHINX-NEXT: neg a4, s1
+; RV64IZHINX-NEXT: neg a3, a3
; RV64IZHINX-NEXT: neg a2, a2
; RV64IZHINX-NEXT: and a0, a4, a0
; RV64IZHINX-NEXT: and a1, a2, a1
@@ -334,18 +334,19 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind {
; RV64IZHINX-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64IZHINX-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; RV64IZHINX-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
-; RV64IZHINX-NEXT: fcvt.s.h a0, a0
-; RV64IZHINX-NEXT: lui a1, 522240
-; RV64IZHINX-NEXT: addi a1, a1, -1
-; RV64IZHINX-NEXT: fle.s a2, zero, a0
-; RV64IZHINX-NEXT: flt.s a1, a1, a0
-; RV64IZHINX-NEXT: neg s0, a1
-; RV64IZHINX-NEXT: neg s1, a2
+; RV64IZHINX-NEXT: fcvt.s.h s0, a0
+; RV64IZHINX-NEXT: fle.s a0, zero, s0
+; RV64IZHINX-NEXT: neg s1, a0
+; RV64IZHINX-NEXT: mv a0, s0
; RV64IZHINX-NEXT: call __fixunssfti
; RV64IZHINX-NEXT: and a0, s1, a0
+; RV64IZHINX-NEXT: lui a2, 522240
; RV64IZHINX-NEXT: and a1, s1, a1
-; RV64IZHINX-NEXT: or a0, s0, a0
-; RV64IZHINX-NEXT: or a1, s0, a1
+; RV64IZHINX-NEXT: addi a2, a2, -1
+; RV64IZHINX-NEXT: flt.s a2, a2, s0
+; RV64IZHINX-NEXT: neg a2, a2
+; RV64IZHINX-NEXT: or a0, a2, a0
+; RV64IZHINX-NEXT: or a1, a2, a1
; RV64IZHINX-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; RV64IZHINX-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; RV64IZHINX-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
index 818ea72..f2c41db 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
@@ -392,3 +392,217 @@ define i64 @zext_i16_to_i64(i16 %a) nounwind {
%1 = zext i16 %a to i64
ret i64 %1
}
+
+define void @pack_lo_packh_hi_packh(i8 zeroext %0, i8 zeroext %1, i8 zeroext %2, i8 zeroext %3, ptr %p) nounwind {
+; RV64I-LABEL: pack_lo_packh_hi_packh:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: slli a2, a2, 16
+; RV64I-NEXT: slli a3, a3, 24
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: or a2, a2, a3
+; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: sw a0, 0(a4)
+; RV64I-NEXT: ret
+;
+; RV64ZBKB-LABEL: pack_lo_packh_hi_packh:
+; RV64ZBKB: # %bb.0:
+; RV64ZBKB-NEXT: packh a0, a0, a1
+; RV64ZBKB-NEXT: packh a1, a2, a3
+; RV64ZBKB-NEXT: packw a0, a0, a1
+; RV64ZBKB-NEXT: sw a0, 0(a4)
+; RV64ZBKB-NEXT: ret
+ %a = zext i8 %0 to i32
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = zext i8 %3 to i32
+ %e = shl i32 %b, 8
+ %f = shl i32 %c, 16
+ %g = shl i32 %d, 24
+ %h = or i32 %a, %e
+ %i = or i32 %h, %f
+ %j = or i32 %i, %g
+ store i32 %j, ptr %p
+ ret void
+}
+
+define void @pack_lo_packh_hi_packh_2(i8 zeroext %0, i8 zeroext %1, i8 zeroext %2, i8 zeroext %3, ptr %p) nounwind {
+; RV64I-LABEL: pack_lo_packh_hi_packh_2:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: slli a2, a2, 16
+; RV64I-NEXT: slli a3, a3, 24
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: or a2, a2, a3
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: sw a0, 0(a4)
+; RV64I-NEXT: ret
+;
+; RV64ZBKB-LABEL: pack_lo_packh_hi_packh_2:
+; RV64ZBKB: # %bb.0:
+; RV64ZBKB-NEXT: packh a0, a0, a1
+; RV64ZBKB-NEXT: packh a1, a3, a2
+; RV64ZBKB-NEXT: packw a0, a0, a1
+; RV64ZBKB-NEXT: sw a0, 0(a4)
+; RV64ZBKB-NEXT: ret
+ %a = zext i8 %0 to i32
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = zext i8 %3 to i32
+ %e = shl i32 %b, 8
+ %f = shl i32 %c, 16
+ %g = shl i32 %d, 24
+ %h = or i32 %a, %e
+ %i = or i32 %g, %h
+ %j = or i32 %f, %i
+ store i32 %j, ptr %p
+ ret void
+}
+
+define void @pack_lo_packh_hi_packh_3(i8 %0, i8 %1, i8 %2, i8 %3, ptr %p) nounwind {
+; RV64I-LABEL: pack_lo_packh_hi_packh_3:
+; RV64I: # %bb.0:
+; RV64I-NEXT: zext.b a0, a0
+; RV64I-NEXT: zext.b a1, a1
+; RV64I-NEXT: zext.b a2, a2
+; RV64I-NEXT: slli a3, a3, 24
+; RV64I-NEXT: slli a1, a1, 8
+; RV64I-NEXT: slli a2, a2, 16
+; RV64I-NEXT: or a0, a3, a0
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: sw a0, 0(a4)
+; RV64I-NEXT: ret
+;
+; RV64ZBKB-LABEL: pack_lo_packh_hi_packh_3:
+; RV64ZBKB: # %bb.0:
+; RV64ZBKB-NEXT: packh a0, a0, a1
+; RV64ZBKB-NEXT: packh a1, a3, a2
+; RV64ZBKB-NEXT: packw a0, a0, a1
+; RV64ZBKB-NEXT: sw a0, 0(a4)
+; RV64ZBKB-NEXT: ret
+ %a = zext i8 %0 to i32
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = zext i8 %3 to i32
+ %e = shl i32 %b, 8
+ %f = shl i32 %c, 16
+ %g = shl i32 %d, 24
+ %h = or i32 %a, %e
+ %i = or i32 %g, %h
+ %j = or i32 %f, %i
+ store i32 %j, ptr %p
+ ret void
+}
+
+define void @pack_lo_zext_hi_packh(i16 zeroext %0, i8 zeroext %1, i8 zeroext %2, ptr %p) nounwind {
+; RV64I-LABEL: pack_lo_zext_hi_packh:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: slli a2, a2, 24
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: sw a0, 0(a3)
+; RV64I-NEXT: ret
+;
+; RV64ZBKB-LABEL: pack_lo_zext_hi_packh:
+; RV64ZBKB: # %bb.0:
+; RV64ZBKB-NEXT: packh a1, a1, a2
+; RV64ZBKB-NEXT: packw a0, a0, a1
+; RV64ZBKB-NEXT: sw a0, 0(a3)
+; RV64ZBKB-NEXT: ret
+ %a = zext i16 %0 to i32
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = shl i32 %c, 8
+ %e = or i32 %b, %d
+ %f = shl i32 %e, 16
+ %g = or i32 %f, %a
+ store i32 %g, ptr %p
+ ret void
+}
+
+; Negative test: %a isn't extended, so we can't use packw for the outer or, but
+; we can use packh for the high half (see the packw sketch after this file's diff).
+define void @pack_lo_noext_hi_packh(i32 %a, i8 zeroext %1, i8 zeroext %2, ptr %p) nounwind {
+; RV64I-LABEL: pack_lo_noext_hi_packh:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: slli a2, a2, 24
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: sw a0, 0(a3)
+; RV64I-NEXT: ret
+;
+; RV64ZBKB-LABEL: pack_lo_noext_hi_packh:
+; RV64ZBKB: # %bb.0:
+; RV64ZBKB-NEXT: packh a1, a1, a2
+; RV64ZBKB-NEXT: slli a1, a1, 16
+; RV64ZBKB-NEXT: or a0, a1, a0
+; RV64ZBKB-NEXT: sw a0, 0(a3)
+; RV64ZBKB-NEXT: ret
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = shl i32 %c, 8
+ %e = or i32 %b, %d
+ %f = shl i32 %e, 16
+ %g = or i32 %f, %a
+ store i32 %g, ptr %p
+ ret void
+}
+
+; Make sure we can match packh+slli without having the input bytes zero-extended.
+define void @pack_i32_lo_noext_hi_packh_nozeroext(i32 %a, i8 %1, i8 %2, ptr %p) nounwind {
+; RV64I-LABEL: pack_i32_lo_noext_hi_packh_nozeroext:
+; RV64I: # %bb.0:
+; RV64I-NEXT: zext.b a1, a1
+; RV64I-NEXT: slli a2, a2, 24
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: or a0, a2, a0
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: sw a0, 0(a3)
+; RV64I-NEXT: ret
+;
+; RV64ZBKB-LABEL: pack_i32_lo_noext_hi_packh_nozeroext:
+; RV64ZBKB: # %bb.0:
+; RV64ZBKB-NEXT: packh a1, a1, a2
+; RV64ZBKB-NEXT: slli a1, a1, 16
+; RV64ZBKB-NEXT: or a0, a1, a0
+; RV64ZBKB-NEXT: sw a0, 0(a3)
+; RV64ZBKB-NEXT: ret
+ %b = zext i8 %1 to i32
+ %c = zext i8 %2 to i32
+ %d = shl i32 %c, 8
+ %e = or i32 %b, %d
+ %f = shl i32 %e, 16
+ %g = or i32 %f, %a
+ store i32 %g, ptr %p
+ ret void
+}
+
+; Make sure we can match packh+slli without having the input bytes zero-extended.
+define i64 @pack_i64_lo_noext_hi_packh_nozeroext(i64 %a, i8 %1, i8 %2, ptr %p) nounwind {
+; RV64I-LABEL: pack_i64_lo_noext_hi_packh_nozeroext:
+; RV64I: # %bb.0:
+; RV64I-NEXT: zext.b a1, a1
+; RV64I-NEXT: zext.b a2, a2
+; RV64I-NEXT: slli a1, a1, 16
+; RV64I-NEXT: slli a2, a2, 24
+; RV64I-NEXT: or a1, a2, a1
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBKB-LABEL: pack_i64_lo_noext_hi_packh_nozeroext:
+; RV64ZBKB: # %bb.0:
+; RV64ZBKB-NEXT: packh a1, a1, a2
+; RV64ZBKB-NEXT: slli a1, a1, 16
+; RV64ZBKB-NEXT: or a0, a1, a0
+; RV64ZBKB-NEXT: ret
+ %b = zext i8 %1 to i64
+ %c = zext i8 %2 to i64
+ %d = shl i64 %c, 8
+ %e = or i64 %b, %d
+ %f = shl i64 %e, 16
+ %g = or i64 %f, %a
+ ret i64 %g
+}
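The RV64 variants use packw instead of pack for the outer combine; a short C sketch of
the assumed packw semantics (helper name is illustrative, not from the patch):

#include <stdint.h>

/* packw (RV64 only): pack the low halfword of each source into a 32-bit value,
   then sign-extend bit 31 to the full 64-bit register. */
static uint64_t packw(uint64_t rs1, uint64_t rs2) {
    uint32_t lo = (uint32_t)(rs1 & 0xffff) | ((uint32_t)(rs2 & 0xffff) << 16);
    return (uint64_t)(int64_t)(int32_t)lo;
}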
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll
new file mode 100644
index 0000000..5b01976
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll
@@ -0,0 +1,586 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+
+define { <2 x i8>, i32 } @vploadff_v2i8(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i8>, i32 } @llvm.vp.load.ff.v2i8.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x i8>, i32 } %load
+}
+
+define { <2 x i8>, i32 } @vploadff_v2i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i8>, i32 } @llvm.vp.load.ff.v2i8.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x i8>, i32 } %load
+}
+
+define { <4 x i8>, i32 } @vploadff_v4i8(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i8>, i32 } @llvm.vp.load.ff.v4i8.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x i8>, i32 } %load
+}
+
+define { <4 x i8>, i32 } @vploadff_v4i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i8>, i32 } @llvm.vp.load.ff.v4i8.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x i8>, i32 } %load
+}
+
+define { <8 x i8>, i32 } @vploadff_v8i8(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i8>, i32 } @llvm.vp.load.ff.v8i8.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x i8>, i32 } %load
+}
+
+define { <8 x i8>, i32 } @vploadff_v8i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i8>, i32 } @llvm.vp.load.ff.v8i8.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x i8>, i32 } %load
+}
+
+define { <2 x i16>, i32 } @vploadff_v2i16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i16>, i32 } @llvm.vp.load.ff.v2i16.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x i16>, i32 } %load
+}
+
+define { <2 x i16>, i32 } @vploadff_v2i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i16>, i32 } @llvm.vp.load.ff.v2i16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x i16>, i32 } %load
+}
+
+define { <4 x i16>, i32 } @vploadff_v4i16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i16>, i32 } @llvm.vp.load.ff.v4i16.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x i16>, i32 } %load
+}
+
+define { <4 x i16>, i32 } @vploadff_v4i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i16>, i32 } @llvm.vp.load.ff.v4i16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x i16>, i32 } %load
+}
+
+define { <8 x i16>, i32 } @vploadff_v8i16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i16>, i32 } @llvm.vp.load.ff.v8i16.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x i16>, i32 } %load
+}
+
+define { <8 x i16>, i32 } @vploadff_v8i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i16>, i32 } @llvm.vp.load.ff.v8i16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x i16>, i32 } %load
+}
+
+define { <2 x i32>, i32 } @vploadff_v2i32(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i32>, i32 } @llvm.vp.load.ff.v2i32.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x i32>, i32 } %load
+}
+
+define { <2 x i32>, i32 } @vploadff_v2i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i32>, i32 } @llvm.vp.load.ff.v2i32.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x i32>, i32 } %load
+}
+
+define { <4 x i32>, i32 } @vploadff_v4i32(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i32>, i32 } @llvm.vp.load.ff.v4i32.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x i32>, i32 } %load
+}
+
+define { <4 x i32>, i32 } @vploadff_v4i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i32>, i32 } @llvm.vp.load.ff.v4i32.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x i32>, i32 } %load
+}
+
+define { <8 x i32>, i32 } @vploadff_v8i32(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i32>, i32 } @llvm.vp.load.ff.v8i32.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x i32>, i32 } %load
+}
+
+define { <8 x i32>, i32 } @vploadff_v8i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i32>, i32 } @llvm.vp.load.ff.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x i32>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vploadff_v2i64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vploadff_v2i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x i64>, i32 } %load
+}
+
+define { <4 x i64>, i32 } @vploadff_v4i64(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i64>, i32 } @llvm.vp.load.ff.v4i64.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x i64>, i32 } %load
+}
+
+define { <4 x i64>, i32 } @vploadff_v4i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x i64>, i32 } @llvm.vp.load.ff.v4i64.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x i64>, i32 } %load
+}
+
+define { <8 x i64>, i32 } @vploadff_v8i64(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i64>, i32 } @llvm.vp.load.ff.v8i64.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x i64>, i32 } %load
+}
+
+define { <8 x i64>, i32 } @vploadff_v8i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x i64>, i32 } @llvm.vp.load.ff.v8i64.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x i64>, i32 } %load
+}
+
+define { <32 x i64>, i32 } @vploadff_v32i64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v32i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: bltu a2, a3, .LBB24_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: .LBB24_2:
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a1), v0.t
+; CHECK-NEXT: csrr a1, vl
+; CHECK-NEXT: sw a1, 256(a0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: ret
+ %load = call { <32 x i64>, i32 } @llvm.vp.load.ff.v32i64.p0(ptr %ptr, <32 x i1> %m, i32 %evl)
+ ret { <32 x i64>, i32 } %load
+}
+
+define { <32 x i64>, i32 } @vploadff_v32i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v32i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a3, 16
+; CHECK-NEXT: bltu a2, a3, .LBB25_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: .LBB25_2:
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a1)
+; CHECK-NEXT: csrr a1, vl
+; CHECK-NEXT: sw a1, 256(a0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: ret
+ %load = call { <32 x i64>, i32 } @llvm.vp.load.ff.v32i64.p0(ptr %ptr, <32 x i1> splat (i1 true), i32 %evl)
+ ret { <32 x i64>, i32 } %load
+}
+
+define { <2 x half>, i32 } @vploadff_v2f16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x half>, i32 } @llvm.vp.load.ff.v2f16.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x half>, i32 } %load
+}
+
+define { <2 x half>, i32 } @vploadff_v2f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x half>, i32 } @llvm.vp.load.ff.v2f16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x half>, i32 } %load
+}
+
+define { <4 x half>, i32 } @vploadff_v4f16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x half>, i32 } @llvm.vp.load.ff.v4f16.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x half>, i32 } %load
+}
+
+define { <4 x half>, i32 } @vploadff_v4f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x half>, i32 } @llvm.vp.load.ff.v4f16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x half>, i32 } %load
+}
+
+define { <8 x half>, i32 } @vploadff_v8f16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x half>, i32 } @llvm.vp.load.ff.v8f16.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x half>, i32 } %load
+}
+
+define { <8 x half>, i32 } @vploadff_v8f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x half>, i32 } @llvm.vp.load.ff.v8f16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x half>, i32 } %load
+}
+
+define { <2 x float>, i32 } @vploadff_v2f32(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x float>, i32 } @llvm.vp.load.ff.v2f32.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x float>, i32 } %load
+}
+
+define { <2 x float>, i32 } @vploadff_v2f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x float>, i32 } @llvm.vp.load.ff.v2f32.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x float>, i32 } %load
+}
+
+define { <4 x float>, i32 } @vploadff_v4f32(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x float>, i32 } @llvm.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x float>, i32 } %load
+}
+
+define { <4 x float>, i32 } @vploadff_v4f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x float>, i32 } @llvm.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x float>, i32 } %load
+}
+
+define { <8 x float>, i32 } @vploadff_v8f32(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x float>, i32 } @llvm.vp.load.ff.v8f32.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x float>, i32 } %load
+}
+
+define { <8 x float>, i32 } @vploadff_v8f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x float>, i32 } @llvm.vp.load.ff.v8f32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x float>, i32 } %load
+}
+
+define { <2 x double>, i32 } @vploadff_v2f64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x double>, i32 } @llvm.vp.load.ff.v2f64.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x double>, i32 } %load
+}
+
+define { <2 x double>, i32 } @vploadff_v2f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x double>, i32 } @llvm.vp.load.ff.v2f64.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x double>, i32 } %load
+}
+
+define { <4 x double>, i32 } @vploadff_v4f64(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x double>, i32 } @llvm.vp.load.ff.v4f64.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x double>, i32 } %load
+}
+
+define { <4 x double>, i32 } @vploadff_v4f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x double>, i32 } @llvm.vp.load.ff.v4f64.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x double>, i32 } %load
+}
+
+define { <8 x double>, i32 } @vploadff_v8f64(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x double>, i32 } @llvm.vp.load.ff.v8f64.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x double>, i32 } %load
+}
+
+define { <8 x double>, i32 } @vploadff_v8f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x double>, i32 } @llvm.vp.load.ff.v8f64.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x double>, i32 } %load
+}
+
+define { <2 x bfloat>, i32 } @vploadff_v2bf16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x bfloat>, i32 } @llvm.vp.load.ff.v2bf16.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+ ret { <2 x bfloat>, i32 } %load
+}
+
+define { <2 x bfloat>, i32 } @vploadff_v2bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v2bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <2 x bfloat>, i32 } @llvm.vp.load.ff.v2bf16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl)
+ ret { <2 x bfloat>, i32 } %load
+}
+
+define { <4 x bfloat>, i32 } @vploadff_v4bf16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x bfloat>, i32 } @llvm.vp.load.ff.v4bf16.p0(ptr %ptr, <4 x i1> %m, i32 %evl)
+ ret { <4 x bfloat>, i32 } %load
+}
+
+define { <4 x bfloat>, i32 } @vploadff_v4bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v4bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <4 x bfloat>, i32 } @llvm.vp.load.ff.v4bf16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl)
+ ret { <4 x bfloat>, i32 } %load
+}
+
+define { <8 x bfloat>, i32 } @vploadff_v8bf16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x bfloat>, i32 } @llvm.vp.load.ff.v8bf16.p0(ptr %ptr, <8 x i1> %m, i32 %evl)
+ ret { <8 x bfloat>, i32 } %load
+}
+
+define { <8 x bfloat>, i32 } @vploadff_v8bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v8bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <8 x bfloat>, i32 } @llvm.vp.load.ff.v8bf16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl)
+ ret { <8 x bfloat>, i32 } %load
+}
+
+define { <7 x i8>, i32 } @vploadff_v7i8(ptr %ptr, <7 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_v7i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <7 x i8>, i32 } @llvm.vp.load.ff.v7i8.p0(ptr %ptr, <7 x i1> %m, i32 %evl)
+ ret { <7 x i8>, i32 } %load
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fp4-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fp4-bitcast.ll
new file mode 100644
index 0000000..ac3cd84
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fp4-bitcast.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -mattr='+v' < %s | FileCheck %s
+
+define <2 x i8> @fp4(<4 x i4> %0) nounwind {
+; CHECK-LABEL: fp4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-NEXT: vmv.x.s a1, v9
+; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: vslidedown.vi v8, v8, 3
+; CHECK-NEXT: andi a0, a0, 15
+; CHECK-NEXT: vmv.x.s a2, v9
+; CHECK-NEXT: andi a1, a1, 15
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: vmv.x.s a1, v8
+; CHECK-NEXT: andi a2, a2, 15
+; CHECK-NEXT: slli a1, a1, 12
+; CHECK-NEXT: slli a2, a2, 8
+; CHECK-NEXT: or a1, a2, a1
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: sh a0, 14(sp)
+; CHECK-NEXT: addi a0, sp, 14
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %2 = bitcast <4 x i4> %0 to <2 x i8>
+ ret <2 x i8> %2
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index f481f9c..9ef7f94 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -89,17 +89,17 @@ entry:
define <2 x i32> @ustest_f64i32(<2 x double> %x) {
; CHECK-NOV-LABEL: ustest_f64i32:
; CHECK-NOV: # %bb.0: # %entry
-; CHECK-NOV-NEXT: fcvt.l.d a1, fa1, rtz
+; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz
; CHECK-NOV-NEXT: li a2, -1
; CHECK-NOV-NEXT: srli a2, a2, 32
-; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz
-; CHECK-NOV-NEXT: blt a1, a2, .LBB2_2
+; CHECK-NOV-NEXT: fcvt.l.d a1, fa1, rtz
+; CHECK-NOV-NEXT: blt a0, a2, .LBB2_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: mv a1, a2
+; CHECK-NOV-NEXT: mv a0, a2
; CHECK-NOV-NEXT: .LBB2_2: # %entry
-; CHECK-NOV-NEXT: blt a0, a2, .LBB2_4
+; CHECK-NOV-NEXT: blt a1, a2, .LBB2_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: mv a0, a2
+; CHECK-NOV-NEXT: mv a1, a2
; CHECK-NOV-NEXT: .LBB2_4: # %entry
; CHECK-NOV-NEXT: sgtz a2, a1
; CHECK-NOV-NEXT: sgtz a3, a0
@@ -254,50 +254,50 @@ entry:
define <4 x i32> @ustest_f32i32(<4 x float> %x) {
; CHECK-NOV-LABEL: ustest_f32i32:
; CHECK-NOV: # %bb.0: # %entry
-; CHECK-NOV-NEXT: fcvt.l.s a1, fa3, rtz
-; CHECK-NOV-NEXT: li a4, -1
-; CHECK-NOV-NEXT: srli a4, a4, 32
-; CHECK-NOV-NEXT: fcvt.l.s a2, fa2, rtz
-; CHECK-NOV-NEXT: bge a1, a4, .LBB5_6
+; CHECK-NOV-NEXT: fcvt.l.s a1, fa0, rtz
+; CHECK-NOV-NEXT: li a5, -1
+; CHECK-NOV-NEXT: srli a5, a5, 32
+; CHECK-NOV-NEXT: fcvt.l.s a2, fa1, rtz
+; CHECK-NOV-NEXT: bge a1, a5, .LBB5_6
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz
-; CHECK-NOV-NEXT: bge a2, a4, .LBB5_7
+; CHECK-NOV-NEXT: fcvt.l.s a3, fa2, rtz
+; CHECK-NOV-NEXT: bge a2, a5, .LBB5_7
; CHECK-NOV-NEXT: .LBB5_2: # %entry
-; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz
-; CHECK-NOV-NEXT: bge a3, a4, .LBB5_8
+; CHECK-NOV-NEXT: fcvt.l.s a4, fa3, rtz
+; CHECK-NOV-NEXT: bge a3, a5, .LBB5_8
; CHECK-NOV-NEXT: .LBB5_3: # %entry
-; CHECK-NOV-NEXT: blt a5, a4, .LBB5_5
+; CHECK-NOV-NEXT: blt a4, a5, .LBB5_5
; CHECK-NOV-NEXT: .LBB5_4: # %entry
-; CHECK-NOV-NEXT: mv a5, a4
+; CHECK-NOV-NEXT: mv a4, a5
; CHECK-NOV-NEXT: .LBB5_5: # %entry
-; CHECK-NOV-NEXT: sgtz a4, a1
-; CHECK-NOV-NEXT: sgtz a6, a2
-; CHECK-NOV-NEXT: sgtz a7, a3
-; CHECK-NOV-NEXT: sgtz t0, a5
+; CHECK-NOV-NEXT: sgtz a5, a4
+; CHECK-NOV-NEXT: sgtz a6, a3
+; CHECK-NOV-NEXT: sgtz a7, a2
+; CHECK-NOV-NEXT: sgtz t0, a1
; CHECK-NOV-NEXT: neg t0, t0
; CHECK-NOV-NEXT: neg a7, a7
; CHECK-NOV-NEXT: neg a6, a6
-; CHECK-NOV-NEXT: neg a4, a4
-; CHECK-NOV-NEXT: and a5, t0, a5
-; CHECK-NOV-NEXT: and a3, a7, a3
-; CHECK-NOV-NEXT: and a2, a6, a2
-; CHECK-NOV-NEXT: and a1, a4, a1
-; CHECK-NOV-NEXT: sw a5, 0(a0)
-; CHECK-NOV-NEXT: sw a3, 4(a0)
-; CHECK-NOV-NEXT: sw a2, 8(a0)
-; CHECK-NOV-NEXT: sw a1, 12(a0)
+; CHECK-NOV-NEXT: neg a5, a5
+; CHECK-NOV-NEXT: and a1, t0, a1
+; CHECK-NOV-NEXT: and a2, a7, a2
+; CHECK-NOV-NEXT: and a3, a6, a3
+; CHECK-NOV-NEXT: and a4, a5, a4
+; CHECK-NOV-NEXT: sw a1, 0(a0)
+; CHECK-NOV-NEXT: sw a2, 4(a0)
+; CHECK-NOV-NEXT: sw a3, 8(a0)
+; CHECK-NOV-NEXT: sw a4, 12(a0)
; CHECK-NOV-NEXT: ret
; CHECK-NOV-NEXT: .LBB5_6: # %entry
-; CHECK-NOV-NEXT: mv a1, a4
-; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz
-; CHECK-NOV-NEXT: blt a2, a4, .LBB5_2
+; CHECK-NOV-NEXT: mv a1, a5
+; CHECK-NOV-NEXT: fcvt.l.s a3, fa2, rtz
+; CHECK-NOV-NEXT: blt a2, a5, .LBB5_2
; CHECK-NOV-NEXT: .LBB5_7: # %entry
-; CHECK-NOV-NEXT: mv a2, a4
-; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz
-; CHECK-NOV-NEXT: blt a3, a4, .LBB5_3
+; CHECK-NOV-NEXT: mv a2, a5
+; CHECK-NOV-NEXT: fcvt.l.s a4, fa3, rtz
+; CHECK-NOV-NEXT: blt a3, a5, .LBB5_3
; CHECK-NOV-NEXT: .LBB5_8: # %entry
-; CHECK-NOV-NEXT: mv a3, a4
-; CHECK-NOV-NEXT: bge a5, a4, .LBB5_4
+; CHECK-NOV-NEXT: mv a3, a5
+; CHECK-NOV-NEXT: bge a4, a5, .LBB5_4
; CHECK-NOV-NEXT: j .LBB5_5
;
; CHECK-V-LABEL: ustest_f32i32:
@@ -720,8 +720,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NOV-NEXT: .cfi_offset fs2, -64
; CHECK-NOV-NEXT: .cfi_remember_state
; CHECK-NOV-NEXT: lhu s1, 0(a1)
-; CHECK-NOV-NEXT: lhu s2, 8(a1)
-; CHECK-NOV-NEXT: lhu a2, 16(a1)
+; CHECK-NOV-NEXT: lhu a2, 8(a1)
+; CHECK-NOV-NEXT: lhu s2, 16(a1)
; CHECK-NOV-NEXT: lhu s3, 24(a1)
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: fmv.w.x fa0, a2
@@ -730,43 +730,43 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs1, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s1
+; CHECK-NOV-NEXT: fmv.w.x fa0, s3
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs0, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s3
+; CHECK-NOV-NEXT: fmv.w.x fa0, s1
; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-NOV-NEXT: li a2, -1
-; CHECK-NOV-NEXT: srli a2, a2, 32
-; CHECK-NOV-NEXT: bge a0, a2, .LBB8_6
+; CHECK-NOV-NEXT: li a3, -1
+; CHECK-NOV-NEXT: srli a3, a3, 32
+; CHECK-NOV-NEXT: bge a0, a3, .LBB8_6
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a1, fs1, rtz
-; CHECK-NOV-NEXT: bge s1, a2, .LBB8_7
+; CHECK-NOV-NEXT: bge s1, a3, .LBB8_7
; CHECK-NOV-NEXT: .LBB8_2: # %entry
-; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz
-; CHECK-NOV-NEXT: bge a1, a2, .LBB8_8
+; CHECK-NOV-NEXT: fcvt.l.s a2, fs0, rtz
+; CHECK-NOV-NEXT: bge a1, a3, .LBB8_8
; CHECK-NOV-NEXT: .LBB8_3: # %entry
-; CHECK-NOV-NEXT: blt a3, a2, .LBB8_5
+; CHECK-NOV-NEXT: blt a2, a3, .LBB8_5
; CHECK-NOV-NEXT: .LBB8_4: # %entry
-; CHECK-NOV-NEXT: mv a3, a2
+; CHECK-NOV-NEXT: mv a2, a3
; CHECK-NOV-NEXT: .LBB8_5: # %entry
-; CHECK-NOV-NEXT: sgtz a2, a0
-; CHECK-NOV-NEXT: sgtz a4, s1
-; CHECK-NOV-NEXT: sgtz a5, a1
-; CHECK-NOV-NEXT: sgtz a6, a3
+; CHECK-NOV-NEXT: sgtz a3, a2
+; CHECK-NOV-NEXT: sgtz a4, a1
+; CHECK-NOV-NEXT: sgtz a5, s1
+; CHECK-NOV-NEXT: sgtz a6, a0
; CHECK-NOV-NEXT: neg a6, a6
; CHECK-NOV-NEXT: neg a5, a5
; CHECK-NOV-NEXT: neg a4, a4
-; CHECK-NOV-NEXT: neg a2, a2
-; CHECK-NOV-NEXT: and a3, a6, a3
-; CHECK-NOV-NEXT: and a1, a5, a1
-; CHECK-NOV-NEXT: and a4, a4, s1
-; CHECK-NOV-NEXT: and a0, a2, a0
-; CHECK-NOV-NEXT: sw a3, 0(s0)
-; CHECK-NOV-NEXT: sw a1, 4(s0)
-; CHECK-NOV-NEXT: sw a4, 8(s0)
-; CHECK-NOV-NEXT: sw a0, 12(s0)
+; CHECK-NOV-NEXT: neg a3, a3
+; CHECK-NOV-NEXT: and a0, a6, a0
+; CHECK-NOV-NEXT: and a5, a5, s1
+; CHECK-NOV-NEXT: and a1, a4, a1
+; CHECK-NOV-NEXT: and a2, a3, a2
+; CHECK-NOV-NEXT: sw a0, 0(s0)
+; CHECK-NOV-NEXT: sw a5, 4(s0)
+; CHECK-NOV-NEXT: sw a1, 8(s0)
+; CHECK-NOV-NEXT: sw a2, 12(s0)
; CHECK-NOV-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
@@ -788,16 +788,16 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NOV-NEXT: ret
; CHECK-NOV-NEXT: .LBB8_6: # %entry
; CHECK-NOV-NEXT: .cfi_restore_state
-; CHECK-NOV-NEXT: mv a0, a2
+; CHECK-NOV-NEXT: mv a0, a3
; CHECK-NOV-NEXT: fcvt.l.s a1, fs1, rtz
-; CHECK-NOV-NEXT: blt s1, a2, .LBB8_2
+; CHECK-NOV-NEXT: blt s1, a3, .LBB8_2
; CHECK-NOV-NEXT: .LBB8_7: # %entry
-; CHECK-NOV-NEXT: mv s1, a2
-; CHECK-NOV-NEXT: fcvt.l.s a3, fs0, rtz
-; CHECK-NOV-NEXT: blt a1, a2, .LBB8_3
+; CHECK-NOV-NEXT: mv s1, a3
+; CHECK-NOV-NEXT: fcvt.l.s a2, fs0, rtz
+; CHECK-NOV-NEXT: blt a1, a3, .LBB8_3
; CHECK-NOV-NEXT: .LBB8_8: # %entry
-; CHECK-NOV-NEXT: mv a1, a2
-; CHECK-NOV-NEXT: bge a3, a2, .LBB8_4
+; CHECK-NOV-NEXT: mv a1, a3
+; CHECK-NOV-NEXT: bge a2, a3, .LBB8_4
; CHECK-NOV-NEXT: j .LBB8_5
;
; CHECK-V-LABEL: ustest_f16i32:
@@ -977,17 +977,17 @@ entry:
define <2 x i16> @ustest_f64i16(<2 x double> %x) {
; CHECK-NOV-LABEL: ustest_f64i16:
; CHECK-NOV: # %bb.0: # %entry
-; CHECK-NOV-NEXT: fcvt.w.d a1, fa1, rtz
+; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz
; CHECK-NOV-NEXT: lui a2, 16
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz
-; CHECK-NOV-NEXT: blt a1, a2, .LBB11_2
+; CHECK-NOV-NEXT: fcvt.w.d a1, fa1, rtz
+; CHECK-NOV-NEXT: blt a0, a2, .LBB11_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: mv a1, a2
+; CHECK-NOV-NEXT: mv a0, a2
; CHECK-NOV-NEXT: .LBB11_2: # %entry
-; CHECK-NOV-NEXT: blt a0, a2, .LBB11_4
+; CHECK-NOV-NEXT: blt a1, a2, .LBB11_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: mv a0, a2
+; CHECK-NOV-NEXT: mv a1, a2
; CHECK-NOV-NEXT: .LBB11_4: # %entry
; CHECK-NOV-NEXT: sgtz a2, a1
; CHECK-NOV-NEXT: sgtz a3, a0
@@ -1146,50 +1146,50 @@ entry:
define <4 x i16> @ustest_f32i16(<4 x float> %x) {
; CHECK-NOV-LABEL: ustest_f32i16:
; CHECK-NOV: # %bb.0: # %entry
-; CHECK-NOV-NEXT: fcvt.w.s a1, fa3, rtz
-; CHECK-NOV-NEXT: lui a4, 16
-; CHECK-NOV-NEXT: addi a4, a4, -1
-; CHECK-NOV-NEXT: fcvt.w.s a2, fa2, rtz
-; CHECK-NOV-NEXT: bge a1, a4, .LBB14_6
+; CHECK-NOV-NEXT: fcvt.w.s a1, fa0, rtz
+; CHECK-NOV-NEXT: lui a5, 16
+; CHECK-NOV-NEXT: addi a5, a5, -1
+; CHECK-NOV-NEXT: fcvt.w.s a2, fa1, rtz
+; CHECK-NOV-NEXT: bge a1, a5, .LBB14_6
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz
-; CHECK-NOV-NEXT: bge a2, a4, .LBB14_7
+; CHECK-NOV-NEXT: fcvt.w.s a3, fa2, rtz
+; CHECK-NOV-NEXT: bge a2, a5, .LBB14_7
; CHECK-NOV-NEXT: .LBB14_2: # %entry
-; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz
-; CHECK-NOV-NEXT: bge a3, a4, .LBB14_8
+; CHECK-NOV-NEXT: fcvt.w.s a4, fa3, rtz
+; CHECK-NOV-NEXT: bge a3, a5, .LBB14_8
; CHECK-NOV-NEXT: .LBB14_3: # %entry
-; CHECK-NOV-NEXT: blt a5, a4, .LBB14_5
+; CHECK-NOV-NEXT: blt a4, a5, .LBB14_5
; CHECK-NOV-NEXT: .LBB14_4: # %entry
-; CHECK-NOV-NEXT: mv a5, a4
+; CHECK-NOV-NEXT: mv a4, a5
; CHECK-NOV-NEXT: .LBB14_5: # %entry
-; CHECK-NOV-NEXT: sgtz a4, a1
-; CHECK-NOV-NEXT: sgtz a6, a2
-; CHECK-NOV-NEXT: sgtz a7, a3
-; CHECK-NOV-NEXT: sgtz t0, a5
+; CHECK-NOV-NEXT: sgtz a5, a4
+; CHECK-NOV-NEXT: sgtz a6, a3
+; CHECK-NOV-NEXT: sgtz a7, a2
+; CHECK-NOV-NEXT: sgtz t0, a1
; CHECK-NOV-NEXT: neg t0, t0
; CHECK-NOV-NEXT: neg a7, a7
; CHECK-NOV-NEXT: neg a6, a6
-; CHECK-NOV-NEXT: neg a4, a4
-; CHECK-NOV-NEXT: and a5, t0, a5
-; CHECK-NOV-NEXT: and a3, a7, a3
-; CHECK-NOV-NEXT: and a2, a6, a2
-; CHECK-NOV-NEXT: and a1, a4, a1
-; CHECK-NOV-NEXT: sh a5, 0(a0)
-; CHECK-NOV-NEXT: sh a3, 2(a0)
-; CHECK-NOV-NEXT: sh a2, 4(a0)
-; CHECK-NOV-NEXT: sh a1, 6(a0)
+; CHECK-NOV-NEXT: neg a5, a5
+; CHECK-NOV-NEXT: and a1, t0, a1
+; CHECK-NOV-NEXT: and a2, a7, a2
+; CHECK-NOV-NEXT: and a3, a6, a3
+; CHECK-NOV-NEXT: and a4, a5, a4
+; CHECK-NOV-NEXT: sh a1, 0(a0)
+; CHECK-NOV-NEXT: sh a2, 2(a0)
+; CHECK-NOV-NEXT: sh a3, 4(a0)
+; CHECK-NOV-NEXT: sh a4, 6(a0)
; CHECK-NOV-NEXT: ret
; CHECK-NOV-NEXT: .LBB14_6: # %entry
-; CHECK-NOV-NEXT: mv a1, a4
-; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz
-; CHECK-NOV-NEXT: blt a2, a4, .LBB14_2
+; CHECK-NOV-NEXT: mv a1, a5
+; CHECK-NOV-NEXT: fcvt.w.s a3, fa2, rtz
+; CHECK-NOV-NEXT: blt a2, a5, .LBB14_2
; CHECK-NOV-NEXT: .LBB14_7: # %entry
-; CHECK-NOV-NEXT: mv a2, a4
-; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz
-; CHECK-NOV-NEXT: blt a3, a4, .LBB14_3
+; CHECK-NOV-NEXT: mv a2, a5
+; CHECK-NOV-NEXT: fcvt.w.s a4, fa3, rtz
+; CHECK-NOV-NEXT: blt a3, a5, .LBB14_3
; CHECK-NOV-NEXT: .LBB14_8: # %entry
-; CHECK-NOV-NEXT: mv a3, a4
-; CHECK-NOV-NEXT: bge a5, a4, .LBB14_4
+; CHECK-NOV-NEXT: mv a3, a5
+; CHECK-NOV-NEXT: bge a4, a5, .LBB14_4
; CHECK-NOV-NEXT: j .LBB14_5
;
; CHECK-V-LABEL: ustest_f32i16:
@@ -1974,72 +1974,72 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-NOV-NEXT: .cfi_remember_state
; CHECK-NOV-NEXT: lhu s1, 32(a1)
; CHECK-NOV-NEXT: lhu s2, 40(a1)
-; CHECK-NOV-NEXT: lhu a2, 48(a1)
-; CHECK-NOV-NEXT: lhu s3, 56(a1)
-; CHECK-NOV-NEXT: lhu s4, 0(a1)
-; CHECK-NOV-NEXT: lhu s5, 8(a1)
+; CHECK-NOV-NEXT: lhu s3, 48(a1)
+; CHECK-NOV-NEXT: lhu s4, 56(a1)
+; CHECK-NOV-NEXT: lhu s5, 0(a1)
+; CHECK-NOV-NEXT: lhu a2, 8(a1)
; CHECK-NOV-NEXT: lhu s6, 16(a1)
; CHECK-NOV-NEXT: lhu s7, 24(a1)
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: fmv.w.x fa0, a2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs6, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s2
+; CHECK-NOV-NEXT: fmv.w.x fa0, s6
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs5, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s1
+; CHECK-NOV-NEXT: fmv.w.x fa0, s7
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs4, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s7
+; CHECK-NOV-NEXT: fmv.w.x fa0, s1
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs3, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s6
+; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs2, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s5
+; CHECK-NOV-NEXT: fmv.w.x fa0, s3
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs1, fa0
; CHECK-NOV-NEXT: fmv.w.x fa0, s4
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fmv.s fs0, fa0
-; CHECK-NOV-NEXT: fmv.w.x fa0, s3
+; CHECK-NOV-NEXT: fmv.w.x fa0, s5
; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz
-; CHECK-NOV-NEXT: lui a4, 16
-; CHECK-NOV-NEXT: addi a4, a4, -1
-; CHECK-NOV-NEXT: bge a0, a4, .LBB17_10
+; CHECK-NOV-NEXT: lui a5, 16
+; CHECK-NOV-NEXT: addi a5, a5, -1
+; CHECK-NOV-NEXT: bge a0, a5, .LBB17_10
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz
-; CHECK-NOV-NEXT: bge s1, a4, .LBB17_11
+; CHECK-NOV-NEXT: bge s1, a5, .LBB17_11
; CHECK-NOV-NEXT: .LBB17_2: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz
-; CHECK-NOV-NEXT: bge a1, a4, .LBB17_12
+; CHECK-NOV-NEXT: bge a1, a5, .LBB17_12
; CHECK-NOV-NEXT: .LBB17_3: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz
-; CHECK-NOV-NEXT: bge a2, a4, .LBB17_13
+; CHECK-NOV-NEXT: bge a2, a5, .LBB17_13
; CHECK-NOV-NEXT: .LBB17_4: # %entry
-; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz
-; CHECK-NOV-NEXT: bge a3, a4, .LBB17_14
+; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz
+; CHECK-NOV-NEXT: bge a3, a5, .LBB17_14
; CHECK-NOV-NEXT: .LBB17_5: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz
-; CHECK-NOV-NEXT: bge a5, a4, .LBB17_15
+; CHECK-NOV-NEXT: bge a4, a5, .LBB17_15
; CHECK-NOV-NEXT: .LBB17_6: # %entry
; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz
-; CHECK-NOV-NEXT: bge a6, a4, .LBB17_16
+; CHECK-NOV-NEXT: bge a6, a5, .LBB17_16
; CHECK-NOV-NEXT: .LBB17_7: # %entry
-; CHECK-NOV-NEXT: blt a7, a4, .LBB17_9
+; CHECK-NOV-NEXT: blt a7, a5, .LBB17_9
; CHECK-NOV-NEXT: .LBB17_8: # %entry
-; CHECK-NOV-NEXT: mv a7, a4
+; CHECK-NOV-NEXT: mv a7, a5
; CHECK-NOV-NEXT: .LBB17_9: # %entry
-; CHECK-NOV-NEXT: sgtz a4, a0
-; CHECK-NOV-NEXT: sgtz t0, s1
-; CHECK-NOV-NEXT: sgtz t1, a1
-; CHECK-NOV-NEXT: sgtz t2, a2
-; CHECK-NOV-NEXT: sgtz t3, a3
-; CHECK-NOV-NEXT: sgtz t4, a5
-; CHECK-NOV-NEXT: sgtz t5, a6
-; CHECK-NOV-NEXT: sgtz t6, a7
+; CHECK-NOV-NEXT: sgtz a5, a7
+; CHECK-NOV-NEXT: sgtz t0, a6
+; CHECK-NOV-NEXT: sgtz t1, a4
+; CHECK-NOV-NEXT: sgtz t2, a3
+; CHECK-NOV-NEXT: sgtz t3, a2
+; CHECK-NOV-NEXT: sgtz t4, a1
+; CHECK-NOV-NEXT: sgtz t5, s1
+; CHECK-NOV-NEXT: sgtz t6, a0
; CHECK-NOV-NEXT: neg t6, t6
; CHECK-NOV-NEXT: neg t5, t5
; CHECK-NOV-NEXT: neg t4, t4
@@ -2047,23 +2047,23 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-NOV-NEXT: neg t2, t2
; CHECK-NOV-NEXT: neg t1, t1
; CHECK-NOV-NEXT: neg t0, t0
-; CHECK-NOV-NEXT: neg a4, a4
-; CHECK-NOV-NEXT: and a7, t6, a7
-; CHECK-NOV-NEXT: and a6, t5, a6
-; CHECK-NOV-NEXT: and a5, t4, a5
-; CHECK-NOV-NEXT: and a3, t3, a3
-; CHECK-NOV-NEXT: and a2, t2, a2
-; CHECK-NOV-NEXT: and a1, t1, a1
-; CHECK-NOV-NEXT: and t0, t0, s1
-; CHECK-NOV-NEXT: and a0, a4, a0
-; CHECK-NOV-NEXT: sh a2, 8(s0)
-; CHECK-NOV-NEXT: sh a1, 10(s0)
-; CHECK-NOV-NEXT: sh t0, 12(s0)
-; CHECK-NOV-NEXT: sh a0, 14(s0)
-; CHECK-NOV-NEXT: sh a7, 0(s0)
-; CHECK-NOV-NEXT: sh a6, 2(s0)
-; CHECK-NOV-NEXT: sh a5, 4(s0)
-; CHECK-NOV-NEXT: sh a3, 6(s0)
+; CHECK-NOV-NEXT: neg a5, a5
+; CHECK-NOV-NEXT: and a0, t6, a0
+; CHECK-NOV-NEXT: and t5, t5, s1
+; CHECK-NOV-NEXT: and a1, t4, a1
+; CHECK-NOV-NEXT: and a2, t3, a2
+; CHECK-NOV-NEXT: and a3, t2, a3
+; CHECK-NOV-NEXT: and a4, t1, a4
+; CHECK-NOV-NEXT: and a6, t0, a6
+; CHECK-NOV-NEXT: and a5, a5, a7
+; CHECK-NOV-NEXT: sh a3, 8(s0)
+; CHECK-NOV-NEXT: sh a4, 10(s0)
+; CHECK-NOV-NEXT: sh a6, 12(s0)
+; CHECK-NOV-NEXT: sh a5, 14(s0)
+; CHECK-NOV-NEXT: sh a0, 0(s0)
+; CHECK-NOV-NEXT: sh t5, 2(s0)
+; CHECK-NOV-NEXT: sh a1, 4(s0)
+; CHECK-NOV-NEXT: sh a2, 6(s0)
; CHECK-NOV-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
@@ -2101,32 +2101,32 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-NOV-NEXT: ret
; CHECK-NOV-NEXT: .LBB17_10: # %entry
; CHECK-NOV-NEXT: .cfi_restore_state
-; CHECK-NOV-NEXT: mv a0, a4
+; CHECK-NOV-NEXT: mv a0, a5
; CHECK-NOV-NEXT: fcvt.l.s a1, fs5, rtz
-; CHECK-NOV-NEXT: blt s1, a4, .LBB17_2
+; CHECK-NOV-NEXT: blt s1, a5, .LBB17_2
; CHECK-NOV-NEXT: .LBB17_11: # %entry
-; CHECK-NOV-NEXT: mv s1, a4
+; CHECK-NOV-NEXT: mv s1, a5
; CHECK-NOV-NEXT: fcvt.l.s a2, fs4, rtz
-; CHECK-NOV-NEXT: blt a1, a4, .LBB17_3
+; CHECK-NOV-NEXT: blt a1, a5, .LBB17_3
; CHECK-NOV-NEXT: .LBB17_12: # %entry
-; CHECK-NOV-NEXT: mv a1, a4
+; CHECK-NOV-NEXT: mv a1, a5
; CHECK-NOV-NEXT: fcvt.l.s a3, fs3, rtz
-; CHECK-NOV-NEXT: blt a2, a4, .LBB17_4
+; CHECK-NOV-NEXT: blt a2, a5, .LBB17_4
; CHECK-NOV-NEXT: .LBB17_13: # %entry
-; CHECK-NOV-NEXT: mv a2, a4
-; CHECK-NOV-NEXT: fcvt.l.s a5, fs2, rtz
-; CHECK-NOV-NEXT: blt a3, a4, .LBB17_5
+; CHECK-NOV-NEXT: mv a2, a5
+; CHECK-NOV-NEXT: fcvt.l.s a4, fs2, rtz
+; CHECK-NOV-NEXT: blt a3, a5, .LBB17_5
; CHECK-NOV-NEXT: .LBB17_14: # %entry
-; CHECK-NOV-NEXT: mv a3, a4
+; CHECK-NOV-NEXT: mv a3, a5
; CHECK-NOV-NEXT: fcvt.l.s a6, fs1, rtz
-; CHECK-NOV-NEXT: blt a5, a4, .LBB17_6
+; CHECK-NOV-NEXT: blt a4, a5, .LBB17_6
; CHECK-NOV-NEXT: .LBB17_15: # %entry
-; CHECK-NOV-NEXT: mv a5, a4
+; CHECK-NOV-NEXT: mv a4, a5
; CHECK-NOV-NEXT: fcvt.l.s a7, fs0, rtz
-; CHECK-NOV-NEXT: blt a6, a4, .LBB17_7
+; CHECK-NOV-NEXT: blt a6, a5, .LBB17_7
; CHECK-NOV-NEXT: .LBB17_16: # %entry
-; CHECK-NOV-NEXT: mv a6, a4
-; CHECK-NOV-NEXT: bge a7, a4, .LBB17_8
+; CHECK-NOV-NEXT: mv a6, a5
+; CHECK-NOV-NEXT: bge a7, a5, .LBB17_8
; CHECK-NOV-NEXT: j .LBB17_9
;
; CHECK-V-LABEL: ustest_f16i16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index 346e40a..02825b2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -5427,18 +5427,18 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_select_op1(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_select_op1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: li a3, 42
+; CHECK-NEXT: lui a3, 1
+; CHECK-NEXT: li a2, 42
+; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a1
; CHECK-NEXT: .LBB117_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmseq.vx v0, v8, a3
-; CHECK-NEXT: vmerge.vxm v8, v8, a1, v0
-; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: vle32.v v9, (a0)
+; CHECK-NEXT: vmseq.vx v0, v9, a2
+; CHECK-NEXT: vse32.v v8, (a0), v0.t
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB117_1
+; CHECK-NEXT: bne a0, a3, .LBB117_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -5472,9 +5472,8 @@ define void @sink_splat_select_op2(ptr nocapture %a, i32 signext %x) {
; CHECK-NEXT: .LBB118_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v9, (a0)
-; CHECK-NEXT: vmseq.vx v0, v9, a2
-; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0
-; CHECK-NEXT: vse32.v v9, (a0)
+; CHECK-NEXT: vmsne.vx v0, v9, a2
+; CHECK-NEXT: vse32.v v8, (a0), v0.t
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: bne a0, a1, .LBB118_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
index 45f158f..09f42ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
@@ -443,8 +443,8 @@ define <vscale x 1 x i64> @straightline_offset_add(ptr %p, i64 %offset) {
ret <vscale x 1 x i64> %x
}
-define <vscale x 1 x i64> @straightline_offset_disjoint_or(ptr %p, i64 %offset) {
-; CHECK-LABEL: @straightline_offset_disjoint_or(
+define <vscale x 1 x i64> @straightline_offset_disjoint_or_1(ptr %p) {
+; CHECK-LABEL: @straightline_offset_disjoint_or_1(
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP1]], i64 8, <vscale x 1 x i1> splat (i1 true), i32 [[TMP2]])
@@ -464,6 +464,33 @@ define <vscale x 1 x i64> @straightline_offset_disjoint_or(ptr %p, i64 %offset)
ret <vscale x 1 x i64> %x
}
+define <vscale x 1 x i64> @straightline_offset_disjoint_or(ptr %p, i1 %offset) {
+; CHECK-LABEL: @straightline_offset_disjoint_or(
+; CHECK-NEXT: [[AND:%.*]] = zext i1 [[OFFSET:%.*]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 4, [[AND]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP1]], i64 8, <vscale x 1 x i1> splat (i1 true), i32 [[TMP2]])
+; CHECK-NEXT: [[X:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> [[TMP3]], <vscale x 1 x i64> poison, i32 [[TMP2]])
+; CHECK-NEXT: ret <vscale x 1 x i64> [[X]]
+;
+ %step = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+ %step.shl = shl <vscale x 1 x i64> %step, splat (i64 1)
+ %add = add <vscale x 1 x i64> %step.shl, splat (i64 4)
+ %zext = zext i1 %offset to i64
+ %splat.insert = insertelement <vscale x 1 x i64> poison, i64 %zext, i64 0
+ %splat = shufflevector <vscale x 1 x i64> %splat.insert, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %offsetv = or disjoint <vscale x 1 x i64> %add, %splat
+ %ptrs = getelementptr i32, ptr %p, <vscale x 1 x i64> %offsetv
+ %x = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(
+ <vscale x 1 x ptr> %ptrs,
+ i32 8,
+ <vscale x 1 x i1> splat (i1 true),
+ <vscale x 1 x i64> poison
+ )
+ ret <vscale x 1 x i64> %x
+}
+
define <vscale x 1 x i64> @straightline_offset_shl(ptr %p) {
; CHECK-LABEL: @straightline_offset_shl(
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
index d0b184b..afe918b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
@@ -13,22 +13,22 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
; RV32: # %bb.0:
; RV32-NEXT: lw a0, 0(a0)
; RV32-NEXT: srli a2, a0, 16
-; RV32-NEXT: slli a3, a0, 16
-; RV32-NEXT: slli a4, a0, 24
-; RV32-NEXT: slli a5, a0, 8
-; RV32-NEXT: srli a6, a3, 24
-; RV32-NEXT: srai a3, a3, 24
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: slli a4, a0, 16
+; RV32-NEXT: slli a5, a0, 24
+; RV32-NEXT: slli a6, a0, 8
; RV32-NEXT: srai a4, a4, 24
; RV32-NEXT: srai a5, a5, 24
+; RV32-NEXT: srai a6, a6, 24
+; RV32-NEXT: sgtz a6, a6
; RV32-NEXT: sgtz a5, a5
; RV32-NEXT: sgtz a4, a4
-; RV32-NEXT: sgtz a3, a3
-; RV32-NEXT: neg a3, a3
; RV32-NEXT: neg a4, a4
; RV32-NEXT: neg a5, a5
-; RV32-NEXT: and a3, a3, a6
-; RV32-NEXT: and a0, a4, a0
-; RV32-NEXT: and a2, a5, a2
+; RV32-NEXT: neg a6, a6
+; RV32-NEXT: and a3, a4, a3
+; RV32-NEXT: and a0, a5, a0
+; RV32-NEXT: and a2, a6, a2
; RV32-NEXT: slli a3, a3, 8
; RV32-NEXT: zext.b a0, a0
; RV32-NEXT: or a0, a0, a3
@@ -39,23 +39,23 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
; RV64-LABEL: vec3_setcc_crash:
; RV64: # %bb.0:
; RV64-NEXT: lw a0, 0(a0)
-; RV64-NEXT: srliw a2, a0, 16
-; RV64-NEXT: slli a3, a0, 48
-; RV64-NEXT: slli a4, a0, 56
-; RV64-NEXT: slli a5, a0, 40
-; RV64-NEXT: srli a6, a3, 56
-; RV64-NEXT: srai a3, a3, 56
+; RV64-NEXT: srli a2, a0, 16
+; RV64-NEXT: srli a3, a0, 8
+; RV64-NEXT: slli a4, a0, 48
+; RV64-NEXT: slli a5, a0, 56
+; RV64-NEXT: slli a6, a0, 40
; RV64-NEXT: srai a4, a4, 56
; RV64-NEXT: srai a5, a5, 56
+; RV64-NEXT: srai a6, a6, 56
+; RV64-NEXT: sgtz a6, a6
; RV64-NEXT: sgtz a5, a5
; RV64-NEXT: sgtz a4, a4
-; RV64-NEXT: sgtz a3, a3
-; RV64-NEXT: neg a3, a3
; RV64-NEXT: neg a4, a4
; RV64-NEXT: neg a5, a5
-; RV64-NEXT: and a3, a3, a6
-; RV64-NEXT: and a0, a4, a0
-; RV64-NEXT: and a2, a5, a2
+; RV64-NEXT: neg a6, a6
+; RV64-NEXT: and a3, a4, a3
+; RV64-NEXT: and a0, a5, a0
+; RV64-NEXT: and a2, a6, a2
; RV64-NEXT: slli a3, a3, 8
; RV64-NEXT: zext.b a0, a0
; RV64-NEXT: or a0, a0, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
index 5c1e41f..b83ddce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice.ll
@@ -470,61 +470,61 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
; CHECK-LABEL: test_vp_splice_nxv16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a4, vlenb
-; CHECK-NEXT: slli a5, a4, 1
-; CHECK-NEXT: addi a5, a5, -1
; CHECK-NEXT: slli a1, a4, 3
-; CHECK-NEXT: mv a7, a2
-; CHECK-NEXT: bltu a2, a5, .LBB22_2
+; CHECK-NEXT: slli a7, a4, 1
+; CHECK-NEXT: addi a7, a7, -1
+; CHECK-NEXT: add a5, a0, a1
+; CHECK-NEXT: mv a6, a2
+; CHECK-NEXT: bltu a2, a7, .LBB22_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a7, a5
+; CHECK-NEXT: mv a6, a7
; CHECK-NEXT: .LBB22_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a5, a5, 5
-; CHECK-NEXT: sub sp, sp, a5
+; CHECK-NEXT: csrr a7, vlenb
+; CHECK-NEXT: slli a7, a7, 5
+; CHECK-NEXT: sub sp, sp, a7
; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: add a5, a0, a1
-; CHECK-NEXT: slli a7, a7, 3
+; CHECK-NEXT: vl8re64.v v24, (a5)
+; CHECK-NEXT: slli a5, a6, 3
; CHECK-NEXT: addi a6, sp, 64
-; CHECK-NEXT: mv t0, a2
+; CHECK-NEXT: add a5, a6, a5
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: bltu a2, a4, .LBB22_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv t0, a4
+; CHECK-NEXT: mv a7, a4
; CHECK-NEXT: .LBB22_4:
-; CHECK-NEXT: vl8re64.v v24, (a5)
-; CHECK-NEXT: add a5, a6, a7
; CHECK-NEXT: vl8re64.v v0, (a0)
-; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a6)
; CHECK-NEXT: sub a0, a2, a4
+; CHECK-NEXT: add a6, a6, a1
+; CHECK-NEXT: sub a7, a3, a4
; CHECK-NEXT: sltu a2, a2, a0
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a0, a2, a0
-; CHECK-NEXT: add a6, a6, a1
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: and a2, a2, a0
+; CHECK-NEXT: sltu a0, a3, a7
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a7
+; CHECK-NEXT: add a7, a5, a1
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v16, (a6)
-; CHECK-NEXT: mv a0, a3
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v24, (a7)
; CHECK-NEXT: bltu a3, a4, .LBB22_6
; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a0, a4
+; CHECK-NEXT: mv a3, a4
; CHECK-NEXT: .LBB22_6:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v0, (a5)
-; CHECK-NEXT: sub a2, a3, a4
-; CHECK-NEXT: add a5, a5, a1
-; CHECK-NEXT: sltu a3, a3, a2
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a2, a3, a2
-; CHECK-NEXT: addi a3, sp, 104
-; CHECK-NEXT: add a1, a3, a1
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v24, (a5)
-; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: addi a2, sp, 104
+; CHECK-NEXT: add a1, a2, a1
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v8, (a3)
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a2)
; CHECK-NEXT: addi sp, s0, -80
; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
@@ -537,66 +537,66 @@ define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vs
define <vscale x 16 x i64> @test_vp_splice_nxv16i64_negative_offset(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) #0 {
; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset:
; CHECK: # %bb.0:
-; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: slli a6, a5, 1
-; CHECK-NEXT: addi a6, a6, -1
-; CHECK-NEXT: slli a1, a5, 3
-; CHECK-NEXT: mv a4, a2
-; CHECK-NEXT: bltu a2, a6, .LBB23_2
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a1, a4, 3
+; CHECK-NEXT: slli a7, a4, 1
+; CHECK-NEXT: addi a7, a7, -1
+; CHECK-NEXT: add a5, a0, a1
+; CHECK-NEXT: mv a6, a2
+; CHECK-NEXT: bltu a2, a7, .LBB23_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a4, a6
+; CHECK-NEXT: mv a6, a7
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: addi sp, sp, -80
; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT: addi s0, sp, 80
-; CHECK-NEXT: csrr a6, vlenb
-; CHECK-NEXT: slli a6, a6, 5
-; CHECK-NEXT: sub sp, sp, a6
+; CHECK-NEXT: csrr a7, vlenb
+; CHECK-NEXT: slli a7, a7, 5
+; CHECK-NEXT: sub sp, sp, a7
; CHECK-NEXT: andi sp, sp, -64
-; CHECK-NEXT: add a6, a0, a1
-; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: vl8re64.v v24, (a5)
+; CHECK-NEXT: slli a5, a6, 3
; CHECK-NEXT: addi a7, sp, 64
+; CHECK-NEXT: add a6, a7, a5
; CHECK-NEXT: mv t0, a2
-; CHECK-NEXT: bltu a2, a5, .LBB23_4
+; CHECK-NEXT: bltu a2, a4, .LBB23_4
; CHECK-NEXT: # %bb.3:
-; CHECK-NEXT: mv t0, a5
+; CHECK-NEXT: mv t0, a4
; CHECK-NEXT: .LBB23_4:
-; CHECK-NEXT: vl8re64.v v24, (a6)
-; CHECK-NEXT: add a6, a7, a4
; CHECK-NEXT: vl8re64.v v0, (a0)
; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a7)
-; CHECK-NEXT: sub a0, a2, a5
+; CHECK-NEXT: sub a0, a2, a4
+; CHECK-NEXT: add a7, a7, a1
+; CHECK-NEXT: sub t0, a3, a4
; CHECK-NEXT: sltu a2, a2, a0
; CHECK-NEXT: addi a2, a2, -1
-; CHECK-NEXT: and a0, a2, a0
-; CHECK-NEXT: add a7, a7, a1
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: and a2, a2, a0
+; CHECK-NEXT: sltu a0, a3, t0
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, t0
+; CHECK-NEXT: add t0, a6, a1
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v16, (a7)
-; CHECK-NEXT: mv a0, a3
-; CHECK-NEXT: bltu a3, a5, .LBB23_6
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vse64.v v24, (t0)
+; CHECK-NEXT: bltu a3, a4, .LBB23_6
; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: mv a0, a5
+; CHECK-NEXT: mv a3, a4
; CHECK-NEXT: .LBB23_6:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: li a2, 8
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v0, (a6)
-; CHECK-NEXT: sub a2, a3, a5
-; CHECK-NEXT: add a5, a6, a1
-; CHECK-NEXT: sltu a3, a3, a2
-; CHECK-NEXT: addi a3, a3, -1
-; CHECK-NEXT: and a2, a3, a2
-; CHECK-NEXT: li a3, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vse64.v v24, (a5)
-; CHECK-NEXT: bltu a4, a3, .LBB23_8
+; CHECK-NEXT: bltu a5, a2, .LBB23_8
; CHECK-NEXT: # %bb.7:
-; CHECK-NEXT: li a4, 8
+; CHECK-NEXT: li a5, 8
; CHECK-NEXT: .LBB23_8:
-; CHECK-NEXT: sub a2, a6, a4
+; CHECK-NEXT: sub a2, a6, a5
; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a2)
; CHECK-NEXT: addi sp, s0, -80
; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/vploadff.ll b/llvm/test/CodeGen/RISCV/rvv/vploadff.ll
new file mode 100644
index 0000000..9e08938
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vploadff.ll
@@ -0,0 +1,1008 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \
+; RUN: -verify-machineinstrs < %s | FileCheck %s
+
+define { <vscale x 1 x i8>, i32 } @vploadff_nxv1i8(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i8>, i32 } @llvm.vp.load.ff.nxv1i8.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x i8>, i32 } %load
+}
+
+define { <vscale x 1 x i8>, i32 } @vploadff_nxv1i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i8>, i32 } @llvm.vp.load.ff.nxv1i8.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x i8>, i32 } %load
+}
+
+define { <vscale x 2 x i8>, i32 } @vploadff_nxv2i8(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i8>, i32 } @llvm.vp.load.ff.nxv2i8.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x i8>, i32 } %load
+}
+
+define { <vscale x 2 x i8>, i32 } @vploadff_nxv2i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i8>, i32 } @llvm.vp.load.ff.nxv2i8.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x i8>, i32 } %load
+}
+
+define { <vscale x 4 x i8>, i32 } @vploadff_nxv4i8(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i8>, i32 } @llvm.vp.load.ff.nxv4i8.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x i8>, i32 } %load
+}
+
+define { <vscale x 4 x i8>, i32 } @vploadff_nxv4i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i8>, i32 } @llvm.vp.load.ff.nxv4i8.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x i8>, i32 } %load
+}
+
+define { <vscale x 8 x i8>, i32 } @vploadff_nxv8i8(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i8>, i32 } @llvm.vp.load.ff.nxv8i8.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x i8>, i32 } %load
+}
+
+define { <vscale x 8 x i8>, i32 } @vploadff_nxv8i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i8>, i32 } @llvm.vp.load.ff.nxv8i8.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x i8>, i32 } %load
+}
+
+define { <vscale x 16 x i8>, i32 } @vploadff_nxv16i8(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i8>, i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x i8>, i32 } %load
+}
+
+define { <vscale x 16 x i8>, i32 } @vploadff_nxv16i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i8>, i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x i8>, i32 } %load
+}
+
+define { <vscale x 32 x i8>, i32 } @vploadff_nxv32i8(ptr %ptr, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x i8>, i32 } @llvm.vp.load.ff.nxv32i8.p0(ptr %ptr, <vscale x 32 x i1> %m, i32 %evl)
+ ret { <vscale x 32 x i8>, i32 } %load
+}
+
+define { <vscale x 32 x i8>, i32 } @vploadff_nxv32i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x i8>, i32 } @llvm.vp.load.ff.nxv32i8.p0(ptr %ptr, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 32 x i8>, i32 } %load
+}
+
+define { <vscale x 64 x i8>, i32 } @vploadff_nxv64i8(ptr %ptr, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 64 x i8>, i32 } @llvm.vp.load.ff.nxv64i8.p0(ptr %ptr, <vscale x 64 x i1> %m, i32 %evl)
+ ret { <vscale x 64 x i8>, i32 } %load
+}
+
+define { <vscale x 64 x i8>, i32 } @vploadff_nxv64i8_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv64i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 64 x i8>, i32 } @llvm.vp.load.ff.nxv64i8.p0(ptr %ptr, <vscale x 64 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 64 x i8>, i32 } %load
+}
+
+define <vscale x 128 x i8> @vploadff_nxv128i8(ptr %ptr, ptr %evl_out, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv128i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: bltu a2, a3, .LBB14_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB14_2:
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: sw a0, 0(a1)
+; CHECK-NEXT: ret
+ %load = call { <vscale x 128 x i8>, i32 } @llvm.vp.load.ff.nxv128i8.p0(ptr %ptr, <vscale x 128 x i1> %m, i32 %evl)
+ %result0 = extractvalue { <vscale x 128 x i8>, i32 } %load, 0
+ %result1 = extractvalue { <vscale x 128 x i8>, i32 } %load, 1
+ store i32 %result1, ptr %evl_out
+ ret <vscale x 128 x i8> %result0
+}
+
+define <vscale x 128 x i8> @vploadff_nxv128i8_allones_mask(ptr %ptr, ptr %evl_out, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv128i8_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 3
+; CHECK-NEXT: bltu a2, a3, .LBB15_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a2, a3
+; CHECK-NEXT: .LBB15_2:
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: sw a0, 0(a1)
+; CHECK-NEXT: ret
+ %load = call { <vscale x 128 x i8>, i32 } @llvm.vp.load.ff.nxv128i8.p0(ptr %ptr, <vscale x 128 x i1> splat (i1 true), i32 %evl)
+ %result0 = extractvalue { <vscale x 128 x i8>, i32 } %load, 0
+ %result1 = extractvalue { <vscale x 128 x i8>, i32 } %load, 1
+ store i32 %result1, ptr %evl_out
+ ret <vscale x 128 x i8> %result0
+}
+
+define { <vscale x 1 x i16>, i32 } @vploadff_nxv1i16(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i16>, i32 } @llvm.vp.load.ff.nxv1i16.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x i16>, i32 } %load
+}
+
+define { <vscale x 1 x i16>, i32 } @vploadff_nxv1i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i16>, i32 } @llvm.vp.load.ff.nxv1i16.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x i16>, i32 } %load
+}
+
+define { <vscale x 2 x i16>, i32 } @vploadff_nxv2i16(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i16>, i32 } @llvm.vp.load.ff.nxv2i16.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x i16>, i32 } %load
+}
+
+define { <vscale x 2 x i16>, i32 } @vploadff_nxv2i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i16>, i32 } @llvm.vp.load.ff.nxv2i16.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x i16>, i32 } %load
+}
+
+define { <vscale x 4 x i16>, i32 } @vploadff_nxv4i16(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i16>, i32 } @llvm.vp.load.ff.nxv4i16.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x i16>, i32 } %load
+}
+
+define { <vscale x 4 x i16>, i32 } @vploadff_nxv4i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i16>, i32 } @llvm.vp.load.ff.nxv4i16.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x i16>, i32 } %load
+}
+
+define { <vscale x 8 x i16>, i32 } @vploadff_nxv8i16(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i16>, i32 } @llvm.vp.load.ff.nxv8i16.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x i16>, i32 } %load
+}
+
+define { <vscale x 8 x i16>, i32 } @vploadff_nxv8i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i16>, i32 } @llvm.vp.load.ff.nxv8i16.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x i16>, i32 } %load
+}
+
+define { <vscale x 16 x i16>, i32 } @vploadff_nxv16i16(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i16>, i32 } @llvm.vp.load.ff.nxv16i16.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x i16>, i32 } %load
+}
+
+define { <vscale x 16 x i16>, i32 } @vploadff_nxv16i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i16>, i32 } @llvm.vp.load.ff.nxv16i16.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x i16>, i32 } %load
+}
+
+define { <vscale x 32 x i16>, i32 } @vploadff_nxv32i16(ptr %ptr, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x i16>, i32 } @llvm.vp.load.ff.nxv32i16.p0(ptr %ptr, <vscale x 32 x i1> %m, i32 %evl)
+ ret { <vscale x 32 x i16>, i32 } %load
+}
+
+define { <vscale x 32 x i16>, i32 } @vploadff_nxv32i16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32i16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x i16>, i32 } @llvm.vp.load.ff.nxv32i16.p0(ptr %ptr, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 32 x i16>, i32 } %load
+}
+
+define { <vscale x 1 x i32>, i32 } @vploadff_nxv1i32(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i32>, i32 } @llvm.vp.load.ff.nxv1i32.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x i32>, i32 } %load
+}
+
+define { <vscale x 1 x i32>, i32 } @vploadff_nxv1i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i32>, i32 } @llvm.vp.load.ff.nxv1i32.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x i32>, i32 } %load
+}
+
+define { <vscale x 2 x i32>, i32 } @vploadff_nxv2i32(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i32>, i32 } @llvm.vp.load.ff.nxv2i32.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x i32>, i32 } %load
+}
+
+define { <vscale x 2 x i32>, i32 } @vploadff_nxv2i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i32>, i32 } @llvm.vp.load.ff.nxv2i32.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x i32>, i32 } %load
+}
+
+define { <vscale x 4 x i32>, i32 } @vploadff_nxv4i32(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x i32>, i32 } %load
+}
+
+define { <vscale x 4 x i32>, i32 } @vploadff_nxv4i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x i32>, i32 } %load
+}
+
+define { <vscale x 8 x i32>, i32 } @vploadff_nxv8i32(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i32>, i32 } @llvm.vp.load.ff.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x i32>, i32 } %load
+}
+
+define { <vscale x 8 x i32>, i32 } @vploadff_nxv8i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i32>, i32 } @llvm.vp.load.ff.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x i32>, i32 } %load
+}
+
+define { <vscale x 16 x i32>, i32 } @vploadff_nxv16i32(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i32>, i32 } @llvm.vp.load.ff.nxv16i32.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x i32>, i32 } %load
+}
+
+define { <vscale x 16 x i32>, i32 } @vploadff_nxv16i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x i32>, i32 } @llvm.vp.load.ff.nxv16i32.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x i32>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vploadff_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vploadff_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 2 x i64>, i32 } @vploadff_nxv2i64(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i64>, i32 } @llvm.vp.load.ff.nxv2i64.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x i64>, i32 } %load
+}
+
+define { <vscale x 2 x i64>, i32 } @vploadff_nxv2i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x i64>, i32 } @llvm.vp.load.ff.nxv2i64.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x i64>, i32 } %load
+}
+
+define { <vscale x 4 x i64>, i32 } @vploadff_nxv4i64(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i64>, i32 } @llvm.vp.load.ff.nxv4i64.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x i64>, i32 } %load
+}
+
+define { <vscale x 4 x i64>, i32 } @vploadff_nxv4i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x i64>, i32 } @llvm.vp.load.ff.nxv4i64.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x i64>, i32 } %load
+}
+
+define { <vscale x 8 x i64>, i32 } @vploadff_nxv8i64(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i64>, i32 } @llvm.vp.load.ff.nxv8i64.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x i64>, i32 } %load
+}
+
+define { <vscale x 8 x i64>, i32 } @vploadff_nxv8i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x i64>, i32 } @llvm.vp.load.ff.nxv8i64.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x half>, i32 } @vploadff_nxv1f16(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x half>, i32 } @llvm.vp.load.ff.nxv1f16.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x half>, i32 } %load
+}
+
+define { <vscale x 1 x half>, i32 } @vploadff_nxv1f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x half>, i32 } @llvm.vp.load.ff.nxv1f16.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x half>, i32 } %load
+}
+
+define { <vscale x 2 x half>, i32 } @vploadff_nxv2f16(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x half>, i32 } @llvm.vp.load.ff.nxv2f16.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x half>, i32 } %load
+}
+
+define { <vscale x 2 x half>, i32 } @vploadff_nxv2f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x half>, i32 } @llvm.vp.load.ff.nxv2f16.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x half>, i32 } %load
+}
+
+define { <vscale x 4 x half>, i32 } @vploadff_nxv4f16(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x half>, i32 } @llvm.vp.load.ff.nxv4f16.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x half>, i32 } %load
+}
+
+define { <vscale x 4 x half>, i32 } @vploadff_nxv4f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x half>, i32 } @llvm.vp.load.ff.nxv4f16.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x half>, i32 } %load
+}
+
+define { <vscale x 8 x half>, i32 } @vploadff_nxv8f16(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x half>, i32 } @llvm.vp.load.ff.nxv8f16.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x half>, i32 } %load
+}
+
+define { <vscale x 8 x half>, i32 } @vploadff_nxv8f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x half>, i32 } @llvm.vp.load.ff.nxv8f16.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x half>, i32 } %load
+}
+
+define { <vscale x 16 x half>, i32 } @vploadff_nxv16f16(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x half>, i32 } @llvm.vp.load.ff.nxv16f16.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x half>, i32 } %load
+}
+
+define { <vscale x 16 x half>, i32 } @vploadff_nxv16f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x half>, i32 } @llvm.vp.load.ff.nxv16f16.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x half>, i32 } %load
+}
+
+define { <vscale x 32 x half>, i32 } @vploadff_nxv32f16(ptr %ptr, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x half>, i32 } @llvm.vp.load.ff.nxv32f16.p0(ptr %ptr, <vscale x 32 x i1> %m, i32 %evl)
+ ret { <vscale x 32 x half>, i32 } %load
+}
+
+define { <vscale x 32 x half>, i32 } @vploadff_nxv32f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32f16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x half>, i32 } @llvm.vp.load.ff.nxv32f16.p0(ptr %ptr, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 32 x half>, i32 } %load
+}
+
+define { <vscale x 1 x float>, i32 } @vploadff_nxv1f32(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x float>, i32 } @llvm.vp.load.ff.nxv1f32.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x float>, i32 } %load
+}
+
+define { <vscale x 1 x float>, i32 } @vploadff_nxv1f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x float>, i32 } @llvm.vp.load.ff.nxv1f32.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x float>, i32 } %load
+}
+
+define { <vscale x 2 x float>, i32 } @vploadff_nxv2f32(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x float>, i32 } @llvm.vp.load.ff.nxv2f32.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x float>, i32 } %load
+}
+
+define { <vscale x 2 x float>, i32 } @vploadff_nxv2f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x float>, i32 } @llvm.vp.load.ff.nxv2f32.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x float>, i32 } %load
+}
+
+define { <vscale x 4 x float>, i32 } @vploadff_nxv4f32(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x float>, i32 } @llvm.vp.load.ff.nxv4f32.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x float>, i32 } %load
+}
+
+define { <vscale x 4 x float>, i32 } @vploadff_nxv4f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x float>, i32 } @llvm.vp.load.ff.nxv4f32.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x float>, i32 } %load
+}
+
+define { <vscale x 8 x float>, i32 } @vploadff_nxv8f32(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x float>, i32 } @llvm.vp.load.ff.nxv8f32.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x float>, i32 } %load
+}
+
+define { <vscale x 8 x float>, i32 } @vploadff_nxv8f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x float>, i32 } @llvm.vp.load.ff.nxv8f32.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x float>, i32 } %load
+}
+
+define { <vscale x 16 x float>, i32 } @vploadff_nxv16f32(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x float>, i32 } @llvm.vp.load.ff.nxv16f32.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x float>, i32 } %load
+}
+
+define { <vscale x 16 x float>, i32 } @vploadff_nxv16f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f32_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x float>, i32 } @llvm.vp.load.ff.nxv16f32.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x float>, i32 } %load
+}
+
+define { <vscale x 1 x double>, i32 } @vploadff_nxv1f64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x double>, i32 } @llvm.vp.load.ff.nxv1f64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x double>, i32 } %load
+}
+
+define { <vscale x 1 x double>, i32 } @vploadff_nxv1f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x double>, i32 } @llvm.vp.load.ff.nxv1f64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x double>, i32 } %load
+}
+
+define { <vscale x 2 x double>, i32 } @vploadff_nxv2f64(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x double>, i32 } @llvm.vp.load.ff.nxv2f64.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x double>, i32 } %load
+}
+
+define { <vscale x 2 x double>, i32 } @vploadff_nxv2f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x double>, i32 } @llvm.vp.load.ff.nxv2f64.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x double>, i32 } %load
+}
+
+define { <vscale x 4 x double>, i32 } @vploadff_nxv4f64(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x double>, i32 } @llvm.vp.load.ff.nxv4f64.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x double>, i32 } %load
+}
+
+define { <vscale x 4 x double>, i32 } @vploadff_nxv4f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x double>, i32 } @llvm.vp.load.ff.nxv4f64.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x double>, i32 } %load
+}
+
+define { <vscale x 8 x double>, i32 } @vploadff_nxv8f64(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x double>, i32 } @llvm.vp.load.ff.nxv8f64.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x double>, i32 } %load
+}
+
+define { <vscale x 8 x double>, i32 } @vploadff_nxv8f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f64_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vle64ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x double>, i32 } @llvm.vp.load.ff.nxv8f64.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x double>, i32 } %load
+}
+
+define { <vscale x 1 x bfloat>, i32 } @vploadff_nxv1bf16(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x bfloat>, i32 } @llvm.vp.load.ff.nxv1bf16.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+ ret { <vscale x 1 x bfloat>, i32 } %load
+}
+
+define { <vscale x 1 x bfloat>, i32 } @vploadff_nxv1bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 1 x bfloat>, i32 } @llvm.vp.load.ff.nxv1bf16.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 1 x bfloat>, i32 } %load
+}
+
+define { <vscale x 2 x bfloat>, i32 } @vploadff_nxv2bf16(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x bfloat>, i32 } @llvm.vp.load.ff.nxv2bf16.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+ ret { <vscale x 2 x bfloat>, i32 } %load
+}
+
+define { <vscale x 2 x bfloat>, i32 } @vploadff_nxv2bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 2 x bfloat>, i32 } @llvm.vp.load.ff.nxv2bf16.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 2 x bfloat>, i32 } %load
+}
+
+define { <vscale x 4 x bfloat>, i32 } @vploadff_nxv4bf16(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x bfloat>, i32 } @llvm.vp.load.ff.nxv4bf16.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+ ret { <vscale x 4 x bfloat>, i32 } %load
+}
+
+define { <vscale x 4 x bfloat>, i32 } @vploadff_nxv4bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 4 x bfloat>, i32 } @llvm.vp.load.ff.nxv4bf16.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 4 x bfloat>, i32 } %load
+}
+
+define { <vscale x 8 x bfloat>, i32 } @vploadff_nxv8bf16(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x bfloat>, i32 } @llvm.vp.load.ff.nxv8bf16.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+ ret { <vscale x 8 x bfloat>, i32 } %load
+}
+
+define { <vscale x 8 x bfloat>, i32 } @vploadff_nxv8bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 8 x bfloat>, i32 } @llvm.vp.load.ff.nxv8bf16.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 8 x bfloat>, i32 } %load
+}
+
+define { <vscale x 16 x bfloat>, i32 } @vploadff_nxv16bf16(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x bfloat>, i32 } @llvm.vp.load.ff.nxv16bf16.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+ ret { <vscale x 16 x bfloat>, i32 } %load
+}
+
+define { <vscale x 16 x bfloat>, i32 } @vploadff_nxv16bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 16 x bfloat>, i32 } @llvm.vp.load.ff.nxv16bf16.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 16 x bfloat>, i32 } %load
+}
+
+define { <vscale x 32 x bfloat>, i32 } @vploadff_nxv32bf16(ptr %ptr, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x bfloat>, i32 } @llvm.vp.load.ff.nxv32bf16.p0(ptr %ptr, <vscale x 32 x i1> %m, i32 %evl)
+ ret { <vscale x 32 x bfloat>, i32 } %load
+}
+
+define { <vscale x 32 x bfloat>, i32 } @vploadff_nxv32bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32bf16_allones_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16ff.v v8, (a0)
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 32 x bfloat>, i32 } @llvm.vp.load.ff.nxv32bf16.p0(ptr %ptr, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+ ret { <vscale x 32 x bfloat>, i32 } %load
+}
+
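+; A non-power-of-two element count such as nxv3i8 is loaded through the wider
+; nxv4i8 container, which is why the e8, mf2 vsetvli is used below.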
+define { <vscale x 3 x i8>, i32 } @vploadff_nxv3i8(ptr %ptr, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vl
+; CHECK-NEXT: ret
+ %load = call { <vscale x 3 x i8>, i32 } @llvm.vp.load.ff.nxv3i8.p0(ptr %ptr, <vscale x 3 x i1> %m, i32 %evl)
+ ret { <vscale x 3 x i8>, i32 } %load
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll b/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll
index f29c74a..697c582 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll
@@ -21,7 +21,7 @@ define <vscale x 4 x i32> @intrinsic_vsha2cl_vv_nxv4i32_nxv4i32(<vscale x 4 x i3
; CHECK-LABEL: intrinsic_vsha2cl_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT: vsha2ch.vv v8, v10, v12
+; CHECK-NEXT: vsha2cl.vv v8, v10, v12
; CHECK-NEXT: ret
entry:
%a = call <vscale x 4 x i32> @llvm.riscv.vsha2cl.nxv4i32.nxv4i32(
@@ -45,7 +45,7 @@ define <vscale x 8 x i32> @intrinsic_vsha2cl_vv_nxv8i32_nxv8i32(<vscale x 8 x i3
; CHECK-LABEL: intrinsic_vsha2cl_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT: vsha2ch.vv v8, v12, v16
+; CHECK-NEXT: vsha2cl.vv v8, v12, v16
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i32> @llvm.riscv.vsha2cl.nxv8i32.nxv8i32(
@@ -70,7 +70,7 @@ define <vscale x 16 x i32> @intrinsic_vsha2cl_vv_nxv16i32_nxv16i32(<vscale x 16
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re32.v v24, (a0)
; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma
-; CHECK-NEXT: vsha2ch.vv v8, v16, v24
+; CHECK-NEXT: vsha2cl.vv v8, v16, v24
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i32> @llvm.riscv.vsha2cl.nxv16i32.nxv16i32(
@@ -94,7 +94,7 @@ define <vscale x 4 x i64> @intrinsic_vsha2cl_vv_nxv4i64_nxv4i64(<vscale x 4 x i6
; CHECK-LABEL: intrinsic_vsha2cl_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT: vsha2ch.vv v8, v12, v16
+; CHECK-NEXT: vsha2cl.vv v8, v12, v16
; CHECK-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vsha2cl.nxv4i64.nxv4i64(
@@ -119,7 +119,7 @@ define <vscale x 8 x i64> @intrinsic_vsha2cl_vv_nxv8i64_nxv8i64(<vscale x 8 x i6
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re64.v v24, (a0)
; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma
-; CHECK-NEXT: vsha2ch.vv v8, v16, v24
+; CHECK-NEXT: vsha2cl.vv v8, v16, v24
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vsha2cl.nxv8i64.nxv8i64(
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index c9c49e8..cb046cd 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -204,18 +204,16 @@ define i64 @load_i64(ptr %p) {
; RV64IZBKB-NEXT: lbu a2, 5(a0)
; RV64IZBKB-NEXT: lbu a3, 6(a0)
; RV64IZBKB-NEXT: lbu a4, 7(a0)
-; RV64IZBKB-NEXT: lbu a5, 0(a0)
-; RV64IZBKB-NEXT: lbu a6, 1(a0)
-; RV64IZBKB-NEXT: lbu a7, 2(a0)
-; RV64IZBKB-NEXT: lbu a0, 3(a0)
+; RV64IZBKB-NEXT: lbu a5, 1(a0)
+; RV64IZBKB-NEXT: lbu a6, 2(a0)
+; RV64IZBKB-NEXT: lbu a7, 3(a0)
+; RV64IZBKB-NEXT: lbu a0, 0(a0)
+; RV64IZBKB-NEXT: packh a3, a3, a4
; RV64IZBKB-NEXT: packh a1, a1, a2
-; RV64IZBKB-NEXT: packh a2, a3, a4
-; RV64IZBKB-NEXT: packh a3, a5, a6
-; RV64IZBKB-NEXT: packh a0, a7, a0
-; RV64IZBKB-NEXT: slli a2, a2, 16
-; RV64IZBKB-NEXT: slli a0, a0, 16
-; RV64IZBKB-NEXT: or a1, a2, a1
-; RV64IZBKB-NEXT: or a0, a0, a3
+; RV64IZBKB-NEXT: packh a2, a6, a7
+; RV64IZBKB-NEXT: packh a0, a0, a5
+; RV64IZBKB-NEXT: packw a1, a1, a3
+; RV64IZBKB-NEXT: packw a0, a0, a2
; RV64IZBKB-NEXT: pack a0, a0, a1
; RV64IZBKB-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/xqcilsm-memset.ll b/llvm/test/CodeGen/RISCV/xqcilsm-memset.ll
new file mode 100644
index 0000000..988bb6f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xqcilsm-memset.ll
@@ -0,0 +1,900 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN: | FileCheck %s -check-prefixes=RV32I
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+experimental-xqcilsm < %s \
+; RUN: | FileCheck %s -check-prefixes=RV32IXQCILSM
+
+%struct.anon = type { [16 x i32] }
+%struct.anon.0 = type { [47 x i32] }
+%struct.anon.1 = type { [48 x i32] }
+%struct.anon.2 = type { [64 x i8] }
+%struct.struct1_t = type { [16 x i32] }
+
+@struct1 = common dso_local local_unnamed_addr global %struct.anon zeroinitializer, align 4
+@struct4b = common dso_local local_unnamed_addr global %struct.anon.0 zeroinitializer, align 4
+@struct4b1 = common dso_local local_unnamed_addr global %struct.anon.1 zeroinitializer, align 4
+@struct2 = common dso_local local_unnamed_addr global %struct.anon.2 zeroinitializer, align 1
+@arr1 = common dso_local local_unnamed_addr global [100 x i32] zeroinitializer, align 4
+@struct1_ = common dso_local local_unnamed_addr global %struct.struct1_t zeroinitializer, align 4
+
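+; A memset whose length is only known at run time stays a tail call to memset,
+; with or without Xqcilsm.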
+define void @test1(ptr nocapture %p, i32 %n) nounwind {
+; RV32I-LABEL: test1:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: mv a2, a1
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test1:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: mv a2, a1
+; RV32IXQCILSM-NEXT: li a1, 0
+; RV32IXQCILSM-NEXT: tail memset
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 1 %p, i8 0, i32 %n, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1)
+
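+; A word-aligned 128-byte memset of 0xa5 splats the byte into 0xa5a5a5a5 and
+; uses two qc.setwmi groups of 16 words each.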
+define void @test2(ptr nocapture %p) nounwind {
+; RV32I-LABEL: test2:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: li a1, 165
+; RV32I-NEXT: li a2, 128
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test2:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a1, 678490
+; RV32IXQCILSM-NEXT: addi a1, a1, 1445
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 64(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 -91, i32 128, i1 false)
+ ret void
+}
+
+define void @test2a(ptr nocapture %p) nounwind {
+; RV32I-LABEL: test2a:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: li a1, 165
+; RV32I-NEXT: li a2, 188
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test2a:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a1, 678490
+; RV32IXQCILSM-NEXT: addi a1, a1, 1445
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 15, 64(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 124(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 -91, i32 188, i1 false)
+ ret void
+}
+
+define void @test2b(ptr nocapture %p) nounwind {
+; RV32I-LABEL: test2b:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: li a1, 165
+; RV32I-NEXT: li a2, 192
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test2b:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: li a1, 165
+; RV32IXQCILSM-NEXT: li a2, 192
+; RV32IXQCILSM-NEXT: tail memset
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 -91, i32 192, i1 false)
+ ret void
+}
+
+define void @test2c(ptr nocapture %p) nounwind {
+; RV32I-LABEL: test2c:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: li a1, 165
+; RV32I-NEXT: li a2, 128
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test2c:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a1, 678490
+; RV32IXQCILSM-NEXT: addi a1, a1, 1445
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 64(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 -91, i32 128, i1 false)
+ ret void
+}
+
+define void @test2d(ptr nocapture %p) nounwind {
+; RV32I-LABEL: test2d:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: li a1, -91
+; RV32I-NEXT: lui a2, 1048570
+; RV32I-NEXT: lui a3, 678490
+; RV32I-NEXT: addi a2, a2, 1445
+; RV32I-NEXT: addi a3, a3, 1445
+; RV32I-NEXT: sw a3, 0(a0)
+; RV32I-NEXT: sw a3, 4(a0)
+; RV32I-NEXT: sh a2, 8(a0)
+; RV32I-NEXT: sb a1, 10(a0)
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test2d:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: li a1, -91
+; RV32IXQCILSM-NEXT: lui a2, 1048570
+; RV32IXQCILSM-NEXT: lui a3, 678490
+; RV32IXQCILSM-NEXT: addi a2, a2, 1445
+; RV32IXQCILSM-NEXT: addi a3, a3, 1445
+; RV32IXQCILSM-NEXT: sw a3, 0(a0)
+; RV32IXQCILSM-NEXT: sw a3, 4(a0)
+; RV32IXQCILSM-NEXT: sh a2, 8(a0)
+; RV32IXQCILSM-NEXT: sb a1, 10(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 -91, i32 11, i1 false)
+ ret void
+}
+
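+; Zeroing uses the zero register directly: 128 bytes (test3a) becomes two
+; qc.setwmi groups, while 256 bytes (test3) is still left to the memset call.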
+define ptr @test3(ptr %p) nounwind {
+; RV32I-LABEL: test3:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: li a2, 256
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test3:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: li a2, 256
+; RV32IXQCILSM-NEXT: li a1, 0
+; RV32IXQCILSM-NEXT: tail memset
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 0, i32 256, i1 false)
+ ret ptr %p
+}
+
+define ptr @test3a(ptr %p) nounwind {
+; RV32I-LABEL: test3a:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: li a2, 128
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test3a:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 64(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 %p, i8 0, i32 128, i1 false)
+ ret ptr %p
+}
+
+define void @test4() nounwind {
+; RV32I-LABEL: test4:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(struct1)
+; RV32I-NEXT: addi a0, a0, %lo(struct1)
+; RV32I-NEXT: li a2, 64
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test4:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(struct1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(struct1)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 0(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @struct1, i8 0, i32 64, i1 false)
+ ret void
+}
+
+define void @test4a(ptr nocapture %s) nounwind {
+; RV32I-LABEL: test4a:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: li a1, 166
+; RV32I-NEXT: li a2, 64
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test4a:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a1, 682602
+; RV32IXQCILSM-NEXT: addi a1, a1, 1702
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 0(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 %s, i8 -90, i32 64, i1 false)
+ ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
+
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
+
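+; The 188-byte zeroing of struct4b is inlined as three qc.setwmi groups (16, 15
+; and 16 words, the last group placed at offset 124 so it ends at byte 188),
+; while the 192-byte zeroing of struct4b1 remains a tail call to memset.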
+define void @test4b() nounwind {
+; RV32I-LABEL: test4b:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lui a0, %hi(struct4b)
+; RV32I-NEXT: addi a0, a0, %lo(struct4b)
+; RV32I-NEXT: li a2, 188
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: call memset
+; RV32I-NEXT: lui a0, %hi(struct4b1)
+; RV32I-NEXT: addi a0, a0, %lo(struct4b1)
+; RV32I-NEXT: li a2, 192
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test4b:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a1, %hi(struct4b)
+; RV32IXQCILSM-NEXT: addi a1, a1, %lo(struct4b)
+; RV32IXQCILSM-NEXT: lui a0, %hi(struct4b1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(struct4b1)
+; RV32IXQCILSM-NEXT: li a2, 192
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 0(a1)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 15, 64(a1)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 124(a1)
+; RV32IXQCILSM-NEXT: li a1, 0
+; RV32IXQCILSM-NEXT: tail memset
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @struct4b, i8 0, i32 188, i1 false)
+ tail call void @llvm.memset.p0.i32(ptr align 4 @struct4b1, i8 0, i32 192, i1 false)
+ ret void
+}
+
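+; struct2 is only byte-aligned, so the 64-byte zeroing is not widened to word
+; stores and both configurations call memset.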
+define void @test5() nounwind {
+; RV32I-LABEL: test5:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(struct2)
+; RV32I-NEXT: addi a0, a0, %lo(struct2)
+; RV32I-NEXT: li a2, 64
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test5:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(struct2)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(struct2)
+; RV32IXQCILSM-NEXT: li a2, 64
+; RV32IXQCILSM-NEXT: li a1, 0
+; RV32IXQCILSM-NEXT: tail memset
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 1 @struct2, i8 0, i32 64, i1 false)
+ ret void
+}
+
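+; The test6* functions zero small stack objects (1 to 8 bytes); these fold into
+; plain sb/sh/sw stores and never use qc.setwmi.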
+define i32 @test6() nounwind {
+; RV32I-LABEL: test6:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test6:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: addi sp, sp, -16
+; RV32IXQCILSM-NEXT: sw zero, 12(sp)
+; RV32IXQCILSM-NEXT: li a0, 0
+; RV32IXQCILSM-NEXT: addi sp, sp, 16
+; RV32IXQCILSM-NEXT: ret
+entry:
+ %x = alloca i32, align 4
+ call void @llvm.memset.p0.i32(ptr align 4 %x, i8 0, i32 4, i1 false)
+ %0 = load i32, ptr %x, align 4
+ ret i32 %0
+}
+
+define zeroext i8 @test6b_c() nounwind {
+; RV32I-LABEL: test6b_c:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sb zero, 12(sp)
+; RV32I-NEXT: lbu a0, 12(sp)
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test6b_c:
+; RV32IXQCILSM: # %bb.0:
+; RV32IXQCILSM-NEXT: addi sp, sp, -16
+; RV32IXQCILSM-NEXT: sb zero, 12(sp)
+; RV32IXQCILSM-NEXT: lbu a0, 12(sp)
+; RV32IXQCILSM-NEXT: addi sp, sp, 16
+; RV32IXQCILSM-NEXT: ret
+ %x = alloca i8, align 4
+ call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %x)
+ call void @llvm.memset.p0.i32(ptr nonnull align 4 %x, i8 0, i32 1, i1 false)
+ %x.0.x.0. = load volatile i8, ptr %x, align 4
+ call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %x)
+ ret i8 %x.0.x.0.
+}
+
+define signext i16 @test6b_s() nounwind {
+; RV32I-LABEL: test6b_s:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sh zero, 12(sp)
+; RV32I-NEXT: lh a0, 12(sp)
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test6b_s:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: addi sp, sp, -16
+; RV32IXQCILSM-NEXT: sh zero, 12(sp)
+; RV32IXQCILSM-NEXT: lh a0, 12(sp)
+; RV32IXQCILSM-NEXT: addi sp, sp, 16
+; RV32IXQCILSM-NEXT: ret
+entry:
+ %x = alloca i16, align 4
+ call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %x)
+ store i16 0, ptr %x, align 4
+ %x.0.x.0. = load volatile i16, ptr %x, align 4
+ call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %x)
+ ret i16 %x.0.x.0.
+}
+
+define i32 @test6b_l() nounwind {
+; RV32I-LABEL: test6b_l:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: lw a0, 12(sp)
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test6b_l:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: addi sp, sp, -16
+; RV32IXQCILSM-NEXT: sw zero, 12(sp)
+; RV32IXQCILSM-NEXT: lw a0, 12(sp)
+; RV32IXQCILSM-NEXT: addi sp, sp, 16
+; RV32IXQCILSM-NEXT: ret
+entry:
+ %x = alloca i32, align 4
+ call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %x)
+ store i32 0, ptr %x, align 4
+ %x.0.x.0. = load volatile i32, ptr %x, align 4
+ call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %x)
+ ret i32 %x.0.x.0.
+}
+
+define i64 @test6b_ll() nounwind {
+; RV32I-LABEL: test6b_ll:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: lw a0, 8(sp)
+; RV32I-NEXT: lw a1, 12(sp)
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test6b_ll:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: addi sp, sp, -16
+; RV32IXQCILSM-NEXT: sw zero, 8(sp)
+; RV32IXQCILSM-NEXT: sw zero, 12(sp)
+; RV32IXQCILSM-NEXT: lw a0, 8(sp)
+; RV32IXQCILSM-NEXT: lw a1, 12(sp)
+; RV32IXQCILSM-NEXT: addi sp, sp, 16
+; RV32IXQCILSM-NEXT: ret
+entry:
+ %x = alloca i64, align 8
+ call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %x)
+ call void @llvm.memset.p0.i32(ptr nonnull align 8 %x, i8 0, i32 8, i1 false)
+ %x.0.x.0. = load volatile i64, ptr %x, align 8
+ call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %x)
+ ret i64 %x.0.x.0.
+}
+
+define zeroext i8 @test6c_c() nounwind {
+; RV32I-LABEL: test6c_c:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sb zero, 15(sp)
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test6c_c:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: addi sp, sp, -16
+; RV32IXQCILSM-NEXT: sb zero, 15(sp)
+; RV32IXQCILSM-NEXT: li a0, 0
+; RV32IXQCILSM-NEXT: addi sp, sp, 16
+; RV32IXQCILSM-NEXT: ret
+entry:
+ %x = alloca i8
+ call void @llvm.memset.p0.i32(ptr align 1 %x, i8 0, i32 1, i1 false)
+ %0 = load i8, ptr %x, align 1
+ ret i8 %0
+}
+
+define signext i16 @test6c_s() nounwind {
+; RV32I-LABEL: test6c_s:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sh zero, 14(sp)
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test6c_s:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: addi sp, sp, -16
+; RV32IXQCILSM-NEXT: sh zero, 14(sp)
+; RV32IXQCILSM-NEXT: li a0, 0
+; RV32IXQCILSM-NEXT: addi sp, sp, 16
+; RV32IXQCILSM-NEXT: ret
+entry:
+ %x = alloca i16
+ call void @llvm.memset.p0.i32(ptr align 2 %x, i8 0, i32 2, i1 false)
+ %0 = load i16, ptr %x, align 2
+ ret i16 %0
+}
+
+define i32 @test6c_l() nounwind {
+; RV32I-LABEL: test6c_l:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test6c_l:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: addi sp, sp, -16
+; RV32IXQCILSM-NEXT: sw zero, 12(sp)
+; RV32IXQCILSM-NEXT: li a0, 0
+; RV32IXQCILSM-NEXT: addi sp, sp, 16
+; RV32IXQCILSM-NEXT: ret
+entry:
+ %x = alloca i32, align 4
+ call void @llvm.memset.p0.i32(ptr align 4 %x, i8 0, i32 4, i1 false)
+ %0 = load i32, ptr %x, align 4
+ ret i32 %0
+}
+
+define i64 @test6c_ll() nounwind {
+; RV32I-LABEL: test6c_ll:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test6c_ll:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: addi sp, sp, -16
+; RV32IXQCILSM-NEXT: sw zero, 8(sp)
+; RV32IXQCILSM-NEXT: sw zero, 12(sp)
+; RV32IXQCILSM-NEXT: li a0, 0
+; RV32IXQCILSM-NEXT: li a1, 0
+; RV32IXQCILSM-NEXT: addi sp, sp, 16
+; RV32IXQCILSM-NEXT: ret
+entry:
+ %x = alloca i64, align 8
+ call void @llvm.memset.p0.i32(ptr align 8 %x, i8 0, i32 8, i1 false)
+ %0 = load i64, ptr %x, align 8
+ ret i64 %0
+}
+
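+; An 8-byte zeroing of arr1 expands to two sw stores of zero in both
+; configurations, and a zero-length memset (test7a) folds away entirely.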
+define void @test7() nounwind {
+; RV32I-LABEL: test7:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: sw zero, %lo(arr1)(a0)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: sw zero, 4(a0)
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test7:
+; RV32IXQCILSM: # %bb.0:
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: sw zero, %lo(arr1)(a0)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: sw zero, 4(a0)
+; RV32IXQCILSM-NEXT: ret
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 8, i1 false)
+ ret void
+}
+
+define void @test7a() nounwind {
+; RV32I-LABEL: test7a:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test7a:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: ret
+entry:
+ call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 0, i1 false)
+ ret void
+}
+
+define void @test7a_unalign() nounwind {
+; RV32I-LABEL: test7a_unalign:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: li a1, -1
+; RV32I-NEXT: sw a1, %lo(arr1)(a0)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: sw a1, 4(a0)
+; RV32I-NEXT: sw a1, 8(a0)
+; RV32I-NEXT: sw a1, 12(a0)
+; RV32I-NEXT: sb a1, 16(a0)
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test7a_unalign:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: li a1, -1
+; RV32IXQCILSM-NEXT: sw a1, %lo(arr1)(a0)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: sw a1, 4(a0)
+; RV32IXQCILSM-NEXT: sw a1, 8(a0)
+; RV32IXQCILSM-NEXT: sw a1, 12(a0)
+; RV32IXQCILSM-NEXT: sb a1, 16(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 -1, i32 17, i1 false)
+ ret void
+}
+
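+; Non-zero fill values are splatted into a word register before qc.setwmi; for
+; 68 bytes of 0xff this gives groups of 16 and 1 words.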
+define void @test7b() nounwind {
+; RV32I-LABEL: test7b:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a1, 255
+; RV32I-NEXT: li a2, 68
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test7b:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: li a1, -1
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 1, 64(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 -1, i32 68, i1 false)
+ ret void
+}
+
+define void @test7c() nounwind {
+; RV32I-LABEL: test7c:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a1, 128
+; RV32I-NEXT: li a2, 128
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test7c:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: lui a1, 526344
+; RV32IXQCILSM-NEXT: addi a1, a1, 128
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 64(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 -128, i32 128, i1 false)
+ ret void
+}
+
+define void @test7d() nounwind {
+; RV32I-LABEL: test7d:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a1, 13
+; RV32I-NEXT: li a2, 148
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test7d:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: lui a1, 53457
+; RV32IXQCILSM-NEXT: addi a1, a1, -755
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 15, 64(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 6, 124(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 13, i32 148, i1 false)
+ ret void
+}
+
+define void @test7e() nounwind {
+; RV32I-LABEL: test7e:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a1, 239
+; RV32I-NEXT: li a2, 100
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test7e:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: lui a1, 982783
+; RV32IXQCILSM-NEXT: addi a1, a1, -17
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi a1, 9, 64(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 -17, i32 100, i1 false)
+ ret void
+}
+
+define void @test8() nounwind {
+; RV32I-LABEL: test8:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: sw zero, %lo(arr1)(a0)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: sw zero, 4(a0)
+; RV32I-NEXT: sw zero, 8(a0)
+; RV32I-NEXT: sw zero, 12(a0)
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test8:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: sw zero, %lo(arr1)(a0)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: sw zero, 4(a0)
+; RV32IXQCILSM-NEXT: sw zero, 8(a0)
+; RV32IXQCILSM-NEXT: sw zero, 12(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 16, i1 false)
+ ret void
+}
+
+define void @test9() nounwind {
+; RV32I-LABEL: test9:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: sw zero, %lo(arr1)(a0)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: sw zero, 20(a0)
+; RV32I-NEXT: sw zero, 24(a0)
+; RV32I-NEXT: sw zero, 28(a0)
+; RV32I-NEXT: sw zero, 4(a0)
+; RV32I-NEXT: sw zero, 8(a0)
+; RV32I-NEXT: sw zero, 12(a0)
+; RV32I-NEXT: sw zero, 16(a0)
+; RV32I-NEXT: ret
+;
+; RV32IXQCILSM-LABEL: test9:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: sw zero, %lo(arr1)(a0)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: sw zero, 20(a0)
+; RV32IXQCILSM-NEXT: sw zero, 24(a0)
+; RV32IXQCILSM-NEXT: sw zero, 28(a0)
+; RV32IXQCILSM-NEXT: sw zero, 4(a0)
+; RV32IXQCILSM-NEXT: sw zero, 8(a0)
+; RV32IXQCILSM-NEXT: sw zero, 12(a0)
+; RV32IXQCILSM-NEXT: sw zero, 16(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 32, i1 false)
+ ret void
+}
+
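+; The remaining tests sweep zeroing sizes from 60 to 192 bytes: up to 188 bytes
+; is covered by at most three qc.setwmi groups of up to 16 words each, while
+; 192 bytes (test15a, test15c) falls back to the memset call.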
+define void @test10() nounwind {
+; RV32I-LABEL: test10:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a2, 60
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test10:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 15, 0(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 60, i1 false)
+ ret void
+}
+
+define void @test11() nounwind {
+; RV32I-LABEL: test11:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a2, 64
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test11:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 0(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 64, i1 false)
+ ret void
+}
+
+define void @test12() nounwind {
+; RV32I-LABEL: test12:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a2, 120
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test12:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 14, 64(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 120, i1 false)
+ ret void
+}
+
+define void @test13() nounwind {
+; RV32I-LABEL: test13:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a2, 124
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test13:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 15, 64(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 124, i1 false)
+ ret void
+}
+
+define void @test14() nounwind {
+; RV32I-LABEL: test14:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a2, 180
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test14:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 15, 64(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 14, 124(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 180, i1 false)
+ ret void
+}
+
+define void @test15() nounwind {
+; RV32I-LABEL: test15:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a2, 184
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test15:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 15, 64(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 15, 124(a0)
+; RV32IXQCILSM-NEXT: ret
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 184, i1 false)
+ ret void
+}
+
+define void @test15a() nounwind {
+; RV32I-LABEL: test15a:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a1, 165
+; RV32I-NEXT: li a2, 192
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test15a:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: li a1, 165
+; RV32IXQCILSM-NEXT: li a2, 192
+; RV32IXQCILSM-NEXT: tail memset
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 -91, i32 192, i1 false)
+ ret void
+}
+
+define void @test15b() nounwind {
+; RV32I-LABEL: test15b:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a2, 188
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test15b:
+; RV32IXQCILSM: # %bb.0:
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 0(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 15, 64(a0)
+; RV32IXQCILSM-NEXT: qc.setwmi zero, 16, 124(a0)
+; RV32IXQCILSM-NEXT: ret
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 188, i1 false)
+ ret void
+}
+
+define void @test15c() nounwind {
+; RV32I-LABEL: test15c:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: lui a0, %hi(arr1)
+; RV32I-NEXT: addi a0, a0, %lo(arr1)
+; RV32I-NEXT: li a2, 192
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: tail memset
+;
+; RV32IXQCILSM-LABEL: test15c:
+; RV32IXQCILSM: # %bb.0: # %entry
+; RV32IXQCILSM-NEXT: lui a0, %hi(arr1)
+; RV32IXQCILSM-NEXT: addi a0, a0, %lo(arr1)
+; RV32IXQCILSM-NEXT: li a2, 192
+; RV32IXQCILSM-NEXT: li a1, 0
+; RV32IXQCILSM-NEXT: tail memset
+entry:
+ tail call void @llvm.memset.p0.i32(ptr align 4 @arr1, i8 0, i32 192, i1 false)
+ ret void
+}