Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll | 128
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/gep.ll | 8
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/rvv-expandload-compressstore.ll | 177
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll | 576
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll | 576
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll | 576
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll | 576
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll | 576
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/avoid-zero-copy.mir | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/fp8-sme2-cvtn.ll | 86
-rw-r--r--  llvm/test/CodeGen/AArch64/luti-with-sme2.ll | 125
-rw-r--r--  llvm/test/CodeGen/AArch64/perm-tb-with-sme2.ll | 306
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll | 134
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll | 139
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll | 125
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll | 298
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll | 280
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll | 83
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-intrinsics-qrshr.ll | 121
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir | 374
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir | 402
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir | 117
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll | 23
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdpal-cs.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdpal-es.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdpal-gs.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdpal-hs.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdpal-ls.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdpal-vs.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdpal.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll | 351
-rw-r--r--  llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll | 18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ctpop16.ll | 328
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dead_copy.mir | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/elf-notes.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/kernel-args.ll | 733
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 40
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 1300
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 79
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 48
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll | 364
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll | 667
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll | 258
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 184
-rw-r--r--  llvm/test/CodeGen/AMDGPU/min.ll | 306
-rw-r--r--  llvm/test/CodeGen/AMDGPU/minimummaximum.ll | 84
-rw-r--r--  llvm/test/CodeGen/AMDGPU/minmax.ll | 554
-rw-r--r--  llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/shl.ll | 59
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sra.ll | 67
-rw-r--r--  llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll | 140
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll | 1
-rw-r--r--  llvm/test/CodeGen/ARM/machine-copyprop.mir | 2
-rw-r--r--  llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll | 131
-rw-r--r--  llvm/test/CodeGen/RISCV/ipra.ll | 17
-rw-r--r--  llvm/test/CodeGen/RISCV/rda-stack.mir | 40
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll | 700
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll | 904
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll | 369
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll | 315
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll | 21
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll | 202
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll | 214
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll | 423
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll | 431
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll | 128
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll | 21
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll | 21
-rw-r--r--  llvm/test/CodeGen/X86/avx512-broadcast-arith.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/avx512-calling-conv.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/avx512-cmp.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/avx512-ext.ll | 18
-rw-r--r--  llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/avx512-load-store.ll | 32
-rw-r--r--  llvm/test/CodeGen/X86/avx512-logic.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/avx512-select.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/cmp-xor.ll | 28
-rw-r--r--  llvm/test/CodeGen/X86/merge-huge-sp-updates.ll | 32
-rw-r--r--  llvm/test/CodeGen/X86/pr32284.ll | 22
-rw-r--r--  llvm/test/DebugInfo/NVPTX/debug-addr-space.ll | 25
-rw-r--r--  llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll | 52
-rw-r--r--  llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll | 4
-rw-r--r--  llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll | 390
-rw-r--r--  llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fadd.ll | 206
-rw-r--r--  llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fmul.ll | 195
-rw-r--r--  llvm/test/MC/ARM/cortex-r52-nofp.s | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll | 27
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll | 5
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll | 54
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/addsub.ll | 54
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll | 88
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll | 6
99 files changed, 8764 insertions, 7919 deletions
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
index 98d5bd5..5a2d08a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
@@ -4,45 +4,45 @@
define void @foo_no_vscale_range() {
; CHECK-LABEL: 'foo_no_vscale_range'
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
@@ -97,25 +97,25 @@ define void @foo_no_vscale_range() {
define void @foo_vscale_range_1_16() vscale_range(1,16) {
; CHECK-LABEL: 'foo_vscale_range_1_16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
@@ -146,25 +146,25 @@ define void @foo_vscale_range_1_16() vscale_range(1,16) {
define void @foo_vscale_range_1_16384() vscale_range(1,16384) {
; CHECK-LABEL: 'foo_vscale_range_1_16384'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
diff --git a/llvm/test/Analysis/CostModel/RISCV/gep.ll b/llvm/test/Analysis/CostModel/RISCV/gep.ll
index 44c0b5b..99c74c4 100644
--- a/llvm/test/Analysis/CostModel/RISCV/gep.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/gep.ll
@@ -268,7 +268,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> <i32 42, i32 43>
; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %3, i32 1, <2 x i1> undef, <2 x i8> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
+; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = getelementptr i8, ptr %base, i32 42
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = getelementptr i8, ptr %base, i32 42
@@ -280,7 +280,7 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> <i32 42, i32 43>
; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> %9, i32 1, <2 x i1> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = getelementptr i8, ptr %base, i32 42
-; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
+; RVI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = getelementptr i8, ptr %base, i32 42
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = getelementptr i8, ptr %base, i32 42
@@ -338,7 +338,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %3 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> zeroinitializer
; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %3, i32 1, <2 x i1> undef, <2 x i8> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %4 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
+; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %5 = getelementptr i8, ptr %base, i32 0
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x5 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %5, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %6 = getelementptr i8, ptr %base, i32 0
@@ -350,7 +350,7 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) {
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %9 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> zeroinitializer
; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> %9, i32 1, <2 x i1> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %10 = getelementptr i8, ptr %base, i32 0
-; RVI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
+; RVI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %11 = getelementptr i8, ptr %base, i32 0
; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.vp.store.v2i8.p0(<2 x i8> undef, ptr %11, <2 x i1> undef, i32 undef)
; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %12 = getelementptr i8, ptr %base, i32 0
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-expandload-compressstore.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-expandload-compressstore.ll
new file mode 100644
index 0000000..ea5f344
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-expandload-compressstore.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin --type-based-intrinsic-cost=true | FileCheck %s --check-prefixes=TYPEBASED
+
+define void @expand_load() {
+; CHECK-LABEL: 'expand_load'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 poison, <2 x i1> poison, <2 x i64> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 poison, <4 x i1> poison, <4 x i64> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 poison, <8 x i1> poison, <8 x i64> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 poison, <16 x i1> poison, <16 x i64> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr poison, <8 x i1> poison, <8 x i64> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr poison, <16 x i1> poison, <16 x i64> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align 8 poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align 8 poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align 8 poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align 8 poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPEBASED-LABEL: 'expand_load'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 poison, <2 x i1> poison, <2 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 poison, <4 x i1> poison, <4 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 poison, <8 x i1> poison, <8 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 poison, <16 x i1> poison, <16 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr poison, <8 x i1> poison, <8 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr poison, <16 x i1> poison, <16 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align 8 poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align 8 poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align 8 poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align 8 poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %t1 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr poison, <2 x i1> poison, <2 x i8> poison)
+ %t2 = call <4 x i8> @llvm.masked.expandload.v4i8(ptr poison, <4 x i1> poison, <4 x i8> poison)
+ %t3 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr poison, <8 x i1> poison, <8 x i8> poison)
+ %t4 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr poison, <16 x i1> poison, <16 x i8> poison)
+ %t5 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align(8) poison, <2 x i1> poison, <2 x i64> poison)
+ %t6 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align(8) poison, <4 x i1> poison, <4 x i64> poison)
+ %t7 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align(8) poison, <8 x i1> poison, <8 x i64> poison)
+ %t8 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align(8) poison, <16 x i1> poison, <16 x i64> poison)
+ %t9 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr poison, <2 x i1> poison, <2 x i64> poison)
+ %t10 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr poison, <4 x i1> poison, <4 x i64> poison)
+ %t11 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr poison, <8 x i1> poison, <8 x i64> poison)
+ %t12 = call <16 x i64> @llvm.masked.expandload.v16i64(ptr poison, <16 x i1> poison, <16 x i64> poison)
+ %t13 = call <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr poison, <vscale x 2 x i1> poison, <vscale x 2 x i8> poison)
+ %t14 = call <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr poison, <vscale x 4 x i1> poison, <vscale x 4 x i8> poison)
+ %t15 = call <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr poison, <vscale x 8 x i1> poison, <vscale x 8 x i8> poison)
+ %t16 = call <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr poison, <vscale x 16 x i1> poison, <vscale x 16 x i8> poison)
+ %t17 = call <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr align(8) poison, <vscale x 2 x i1> poison, <vscale x 2 x i64> poison)
+ %t18 = call <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr align(8) poison, <vscale x 4 x i1> poison, <vscale x 4 x i64> poison)
+ %t19 = call <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr align(8) poison, <vscale x 8 x i1> poison, <vscale x 8 x i64> poison)
+ %t20 = call <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr align(8) poison, <vscale x 16 x i1> poison, <vscale x 16 x i64> poison)
+ ret void
+}
+
+define void @compress_store() {
+; CHECK-LABEL: 'compress_store'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> poison, ptr poison, <2 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v4i8(<4 x i8> poison, ptr poison, <4 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> poison, ptr poison, <8 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> poison, ptr poison, <16 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr align 8 poison, <2 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr align 8 poison, <4 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr align 8 poison, <8 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr align 8 poison, <16 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr poison, <2 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr poison, <4 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr poison, <8 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 78 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr poison, <16 x i1> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> poison, ptr align 8 poison, <vscale x 2 x i1> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> poison, ptr align 8 poison, <vscale x 4 x i1> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> poison, ptr align 8 poison, <vscale x 8 x i1> poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> poison, ptr align 8 poison, <vscale x 16 x i1> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPEBASED-LABEL: 'compress_store'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> poison, ptr poison, <2 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i8(<4 x i8> poison, ptr poison, <4 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> poison, ptr poison, <8 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> poison, ptr poison, <16 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr align 8 poison, <2 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr align 8 poison, <4 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr align 8 poison, <8 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr align 8 poison, <16 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr poison, <2 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr poison, <4 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr poison, <8 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr poison, <16 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> poison, ptr align 8 poison, <vscale x 2 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> poison, ptr align 8 poison, <vscale x 4 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> poison, ptr align 8 poison, <vscale x 8 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> poison, ptr align 8 poison, <vscale x 16 x i1> poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call void @llvm.masked.compressstore.v2i8(<2 x i8> poison, ptr poison, <2 x i1> poison)
+ call void @llvm.masked.compressstore.v4i8(<4 x i8> poison, ptr poison, <4 x i1> poison)
+ call void @llvm.masked.compressstore.v8i8(<8 x i8> poison, ptr poison, <8 x i1> poison)
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> poison, ptr poison, <16 x i1> poison)
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr align(8) poison, <2 x i1> poison)
+ call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr align(8) poison, <4 x i1> poison)
+ call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr align(8) poison, <8 x i1> poison)
+ call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr align(8) poison, <16 x i1> poison)
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> poison, ptr poison, <2 x i1> poison)
+ call void @llvm.masked.compressstore.v4i64(<4 x i64> poison, ptr poison, <4 x i1> poison)
+ call void @llvm.masked.compressstore.v8i64(<8 x i64> poison, ptr poison, <8 x i1> poison)
+ call void @llvm.masked.compressstore.v16i64(<16 x i64> poison, ptr poison, <16 x i1> poison)
+ call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> poison, ptr poison, <vscale x 2 x i1> poison)
+ call void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8> poison, ptr poison, <vscale x 4 x i1> poison)
+ call void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8> poison, ptr poison, <vscale x 8 x i1> poison)
+ call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> poison, ptr poison, <vscale x 16 x i1> poison)
+ call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> poison, ptr align(8) poison, <vscale x 2 x i1> poison)
+ call void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64> poison, ptr align(8) poison, <vscale x 4 x i1> poison)
+ call void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64> poison, ptr align(8) poison, <vscale x 8 x i1> poison)
+ call void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64> poison, ptr align(8) poison, <vscale x 16 x i1> poison)
+ ret void
+}
+
+declare <2 x i8> @llvm.masked.expandload.v2i8(ptr, <2 x i1>, <2 x i8>)
+declare <4 x i8> @llvm.masked.expandload.v4i8(ptr, <4 x i1>, <4 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <16 x i64> @llvm.masked.expandload.v16i64(ptr, <16 x i1>, <16 x i64>)
+declare <vscale x 2 x i8> @llvm.masked.expandload.nxv2i8(ptr, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 4 x i8> @llvm.masked.expandload.nxv4i8(ptr, <vscale x 4 x i1>, <vscale x 4 x i8>)
+declare <vscale x 8 x i8> @llvm.masked.expandload.nxv8i8(ptr, <vscale x 8 x i1>, <vscale x 8 x i8>)
+declare <vscale x 16 x i8> @llvm.masked.expandload.nxv16i8(ptr, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare <vscale x 2 x i64> @llvm.masked.expandload.nxv2i64(ptr, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.masked.expandload.nxv4i64(ptr, <vscale x 4 x i1>, <vscale x 4 x i64>)
+declare <vscale x 8 x i64> @llvm.masked.expandload.nxv8i64(ptr, <vscale x 8 x i1>, <vscale x 8 x i64>)
+declare <vscale x 16 x i64> @llvm.masked.expandload.nxv16i64(ptr, <vscale x 16 x i1>, <vscale x 16 x i64>)
+
+declare void @llvm.masked.compressstore.v2i8(<2 x i8>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i8(<4 x i8>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i64(<16 x i64>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8>, ptr, <vscale x 2 x i1>)
+declare void @llvm.masked.compressstore.nxv4i8(<vscale x 4 x i8>, ptr, <vscale x 4 x i1>)
+declare void @llvm.masked.compressstore.nxv8i8(<vscale x 8 x i8>, ptr, <vscale x 8 x i1>)
+declare void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8>, ptr, <vscale x 16 x i1>)
+declare void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>)
+declare void @llvm.masked.compressstore.nxv4i64(<vscale x 4 x i64>, ptr, <vscale x 4 x i1>)
+declare void @llvm.masked.compressstore.nxv8i64(<vscale x 8 x i64>, ptr, <vscale x 8 x i1>)
+declare void @llvm.masked.compressstore.nxv16i64(<vscale x 16 x i64>, ptr, <vscale x 16 x i1>)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
index 47472d5..d49528d 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
%V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT: Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 89 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
index e18a4d0..41f0287 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 214 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
%V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 196 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 214 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT: Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index 4ca1f7f..6f90816 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 214 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 283 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
%V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 196 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 214 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 212 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT: Cost Model: Found an estimated cost of 106 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 143 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 283 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 141 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
index 4fb2f1a..94b9b56 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
%V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT: Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 89 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
index 6be8160..bea77c8 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
@@ -1178,165 +1178,165 @@ define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m
define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
; SSE2-LABEL: 'masked_expandload'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_expandload'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX1-LABEL: 'masked_expandload'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX2-LABEL: 'masked_expandload'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SKL-LABEL: 'masked_expandload'
-; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 139 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX512-LABEL: 'masked_expandload'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> %m4, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> %m2, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> %m1, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> %m16, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> %m8, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> %m4, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> %m2, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> %m8, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> %m4, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> %m2, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> %m1, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> %m16, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> %m8, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> %m4, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> %m2, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> %m32, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> %m16, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> %m8, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> %m4, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 347 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> %m64, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> %m32, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> %m16, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> %m8, <8 x i8> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
%V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> %m8, <8 x double> undef)
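The expandload/compressstore expectations above are FileCheck assertions produced by LLVM's cost-model printer under the size-latency cost kind; test files like this one are normally regenerated with utils/update_analyze_test_checks.py rather than edited by hand. A minimal sketch of that pattern, assuming an x86-64 triple, +avx2, and a hypothetical @expandload_sketch function (the real test's RUN lines, prefixes, and cost values differ):

; RUN: opt < %s -disable-output -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 \
; RUN:   -passes="print<cost-model>" -cost-kind=size-latency 2>&1 | FileCheck %s

define <4 x double> @expandload_sketch(ptr %p, <4 x i1> %m, <4 x double> %pt) {
; The printer emits one "Cost Model: ..." line per instruction; the numeric
; cost is left as a wildcard here since it depends on target and cost kind.
; CHECK: Cost Model: Found an estimated cost of {{[0-9]+}} for instruction: %v = call <4 x double> @llvm.masked.expandload.v4f64
  %v = call <4 x double> @llvm.masked.expandload.v4f64(ptr %p, <4 x i1> %m, <4 x double> %pt)
  ret <4 x double> %v
}

declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)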
@@ -1374,165 +1374,165 @@ define i32 @masked_expandload(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1>
define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) {
; SSE2-LABEL: 'masked_compressstore'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 384 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_compressstore'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 260 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 130 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX1-LABEL: 'masked_compressstore'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 278 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX2-LABEL: 'masked_compressstore'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; SKL-LABEL: 'masked_compressstore'
-; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; SKL-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; SKL-NEXT: Cost Model: Found an estimated cost of 139 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; SKL-NEXT: Cost Model: Found an estimated cost of 276 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; SKL-NEXT: Cost Model: Found an estimated cost of 138 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; SKL-NEXT: Cost Model: Found an estimated cost of 69 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
; AVX512-LABEL: 'masked_compressstore'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
-; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> %m1)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 89 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> %m2)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 175 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 87 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> %m8)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> %m4)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 347 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> %m64)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 173 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> %m32)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> %m16)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> %m8)
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
;
call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> %m8)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 7a67cf3..15ee5e4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -2262,7 +2262,7 @@ declare ptr @llvm.invariant.start.p0(i64, ptr nocapture) readonly nounwind
declare void @llvm.invariant.end.p0(ptr, i64, ptr nocapture) nounwind
define void @test_invariant_intrin() {
; CHECK-LABEL: name: test_invariant_intrin
-; CHECK: %{{[0-9]+}}:_(s64) = G_IMPLICIT_DEF
+; CHECK: %{{[0-9]+}}:_(p0) = G_IMPLICIT_DEF
; CHECK-NEXT: RET_ReallyLR
%x = alloca %t
%inv = call ptr @llvm.invariant.start.p0(i64 8, ptr %x)
diff --git a/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir b/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
index b940734..0a915aa 100644
--- a/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
+++ b/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
@@ -1,6 +1,8 @@
# Check that we can remove the redundant save of constant registers such as $wzr
# RUN: llc -mtriple=aarch64-unknown-linux %s -verify-machineinstrs -start-before=machine-cp -o - | FileCheck %s --check-prefix ASM
# RUN: llc -mtriple=aarch64-unknown-linux %s -verify-machineinstrs -run-pass=machine-cp -o - | FileCheck %s
+
+# RUN: llc -mtriple=aarch64-unknown-linux %s -passes=machine-cp -o - | FileCheck %s
--- |
target triple = "aarch64-unknown-linux"
declare i32 @bar(i32) nounwind
diff --git a/llvm/test/CodeGen/AArch64/fp8-sme2-cvtn.ll b/llvm/test/CodeGen/AArch64/fp8-sme2-cvtn.ll
new file mode 100644
index 0000000..d1e0729
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp8-sme2-cvtn.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sme2,+fp8 -enable-subreg-liveness --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @cvtn_f16_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: cvtn_f16_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x8]
+; CHECK-NEXT: fcvtn z0.b, { z2.h, z3.h }
+; CHECK-NEXT: fcvtn z1.b, { z10.h, z11.h }
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %1, 0
+ %3 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %4, 0
+ %6 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %4, 1
+ %res1 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> %2, <vscale x 8 x half> %5)
+ %res2 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> %3, <vscale x 8 x half> %6)
+ %ins1 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %res1, 0
+ %ins2 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins1, <vscale x 16 x i8> %res2, 1
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins2
+}
+
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @cvtnt_f32_tuple(i64 %stride, ptr %ptr, <vscale x 16 x i8> %d) {
+; CHECK-LABEL: cvtnt_f32_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: ld1w { z2.s, z10.s }, pn8/z, [x1]
+; CHECK-NEXT: ld1w { z3.s, z11.s }, pn8/z, [x8]
+; CHECK-NEXT: fcvtnt z0.b, { z2.s, z3.s }
+; CHECK-NEXT: fcvtnt z1.b, { z10.s, z11.s }
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %1, 0
+ %3 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %4, 0
+ %6 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %4, 1
+ %res1 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> %d, <vscale x 4 x float> %2, <vscale x 4 x float> %5)
+ %res2 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> %d, <vscale x 4 x float> %3, <vscale x 4 x float> %6)
+ %ins1 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %res1, 0
+ %ins2 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins1, <vscale x 16 x i8> %res2, 1
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins2
+}
diff --git a/llvm/test/CodeGen/AArch64/luti-with-sme2.ll b/llvm/test/CodeGen/AArch64/luti-with-sme2.ll
new file mode 100644
index 0000000..2d30167
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/luti-with-sme2.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -enable-subreg-liveness -force-streaming -mattr=+sve2,+sme2,+lut,+bf16 | FileCheck %s
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @test_luti4_lane_i16_x2_tuple(i64 %stride, ptr %ptr, <vscale x 16 x i8> %indices) {
+; CHECK-LABEL: test_luti4_lane_i16_x2_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z4.h, z12.h }, pn8/z, [x8]
+; CHECK-NEXT: luti4 z2.h, { z3.h, z4.h }, z0[0]
+; CHECK-NEXT: luti4 z1.h, { z11.h, z12.h }, z0[0]
+; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+ %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+ %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+ %res1 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 16 x i8> %indices, i32 0)
+ %res2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.luti4.lane.x2.nxv8i16(<vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 16 x i8> %indices, i32 0)
+ %ins1 = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> %res1, 0
+ %ins2 = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %ins1, <vscale x 8 x i16> %res2, 1
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %ins2
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half> } @test_luti4_lane_f16_x2_tuple(i64 %stride, ptr %ptr, <vscale x 16 x i8> %indices) {
+; CHECK-LABEL: test_luti4_lane_f16_x2_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z4.h, z12.h }, pn8/z, [x8]
+; CHECK-NEXT: luti4 z2.h, { z3.h, z4.h }, z0[0]
+; CHECK-NEXT: luti4 z1.h, { z11.h, z12.h }, z0[0]
+; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %1, 0
+ %3 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %4, 0
+ %6 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %4, 1
+ %res1 = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> %2, <vscale x 8 x half> %5, <vscale x 16 x i8> %indices, i32 0)
+ %res2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.luti4.lane.x2.nxv8f16(<vscale x 8 x half> %3, <vscale x 8 x half> %6, <vscale x 16 x i8> %indices, i32 0)
+ %ins1 = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } poison, <vscale x 8 x half> %res1, 0
+ %ins2 = insertvalue { <vscale x 8 x half>, <vscale x 8 x half> } %ins1, <vscale x 8 x half> %res2, 1
+ ret { <vscale x 8 x half>, <vscale x 8 x half> } %ins2
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_luti4_lane_bf16_x2_tuple(i64 %stride, ptr %ptr, <vscale x 16 x i8> %indices) {
+; CHECK-LABEL: test_luti4_lane_bf16_x2_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z4.h, z12.h }, pn8/z, [x8]
+; CHECK-NEXT: luti4 z2.h, { z3.h, z4.h }, z0[0]
+; CHECK-NEXT: luti4 z1.h, { z11.h, z12.h }, z0[0]
+; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %1, 0
+ %3 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %4, 0
+ %6 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %4, 1
+ %res1 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> %2, <vscale x 8 x bfloat> %5, <vscale x 16 x i8> %indices, i32 0)
+ %res2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.luti4.lane.x2.nxv8bf16(<vscale x 8 x bfloat> %3, <vscale x 8 x bfloat> %6, <vscale x 16 x i8> %indices, i32 0)
+ %ins1 = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } poison, <vscale x 8 x bfloat> %res1, 0
+ %ins2 = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %ins1, <vscale x 8 x bfloat> %res2, 1
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %ins2
+}
diff --git a/llvm/test/CodeGen/AArch64/perm-tb-with-sme2.ll b/llvm/test/CodeGen/AArch64/perm-tb-with-sme2.ll
new file mode 100644
index 0000000..7b55c69
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/perm-tb-with-sme2.ll
@@ -0,0 +1,306 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -enable-subreg-liveness -force-streaming -mattr=+sve2,+sme2 | FileCheck %s
+
+;
+; TBL2
+;
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @tbl2_b_tuple(i64 %stride, ptr %ptr, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: tbl2_b_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ld1b { z3.b, z11.b }, pn8/z, [x1]
+; CHECK-NEXT: ld1b { z4.b, z12.b }, pn8/z, [x1, x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: tbl z2.b, { z3.b, z4.b }, z0.b
+; CHECK-NEXT: tbl z1.b, { z11.b, z12.b }, z0.b
+; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ %res1 = call <vscale x 16 x i8> @llvm.aarch64.sve.tbl2.nxv16i8(<vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %a)
+ %res2 = call <vscale x 16 x i8> @llvm.aarch64.sve.tbl2.nxv16i8(<vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> %a)
+ %ins1 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %res1, 0
+ %ins2 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins1, <vscale x 16 x i8> %res2, 1
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins2
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @tbl2_h_tuple(i64 %stride, ptr %ptr, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: tbl2_h_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z4.h, z12.h }, pn8/z, [x8]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: tbl z2.h, { z3.h, z4.h }, z0.h
+; CHECK-NEXT: tbl z1.h, { z11.h, z12.h }, z0.h
+; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+ %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+ %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+ %res1 = call <vscale x 8 x i16> @llvm.aarch64.sve.tbl2.nxv8i16(<vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> %a)
+ %res2 = call <vscale x 8 x i16> @llvm.aarch64.sve.tbl2.nxv8i16(<vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> %a)
+ %ins1 = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> %res1, 0
+ %ins2 = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %ins1, <vscale x 8 x i16> %res2, 1
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %ins2
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @tbl2_s_tuple(i64 %stride, ptr %ptr, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: tbl2_s_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1w { z3.s, z11.s }, pn8/z, [x1]
+; CHECK-NEXT: ld1w { z4.s, z12.s }, pn8/z, [x8]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: tbl z2.s, { z3.s, z4.s }, z0.s
+; CHECK-NEXT: tbl z1.s, { z11.s, z12.s }, z0.s
+; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %1, 0
+ %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %4, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %4, 1
+ %res1 = call <vscale x 4 x i32> @llvm.aarch64.sve.tbl2.nxv4i32(<vscale x 4 x i32> %2, <vscale x 4 x i32> %5, <vscale x 4 x i32> %a)
+ %res2 = call <vscale x 4 x i32> @llvm.aarch64.sve.tbl2.nxv4i32(<vscale x 4 x i32> %3, <vscale x 4 x i32> %6, <vscale x 4 x i32> %a)
+ %ins1 = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } poison, <vscale x 4 x i32> %res1, 0
+ %ins2 = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ins1, <vscale x 4 x i32> %res2, 1
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %ins2
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @tbl2_d_tuple(i64 %stride, ptr %ptr, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: tbl2_d_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1d { z3.d, z11.d }, pn8/z, [x1]
+; CHECK-NEXT: ld1d { z4.d, z12.d }, pn8/z, [x8]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: tbl z2.d, { z3.d, z4.d }, z0.d
+; CHECK-NEXT: tbl z1.d, { z11.d, z12.d }, z0.d
+; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %1, 0
+ %3 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %4, 0
+ %6 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %4, 1
+ %res1 = call <vscale x 2 x i64> @llvm.aarch64.sve.tbl2.nxv2i64(<vscale x 2 x i64> %2, <vscale x 2 x i64> %5, <vscale x 2 x i64> %a)
+ %res2 = call <vscale x 2 x i64> @llvm.aarch64.sve.tbl2.nxv2i64(<vscale x 2 x i64> %3, <vscale x 2 x i64> %6, <vscale x 2 x i64> %a)
+ %ins1 = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } poison, <vscale x 2 x i64> %res1, 0
+ %ins2 = insertvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %ins1, <vscale x 2 x i64> %res2, 1
+ ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %ins2
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @tbl2_bf16_tuple(i64 %stride, ptr %ptr, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: tbl2_bf16_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z4.h, z12.h }, pn8/z, [x8]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: tbl z2.h, { z3.h, z4.h }, z0.h
+; CHECK-NEXT: tbl z1.h, { z11.h, z12.h }, z0.h
+; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %1, 0
+ %3 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %4, 0
+ %6 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %4, 1
+ %res1 = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tbl2.nxv8bf16(<vscale x 8 x bfloat> %2, <vscale x 8 x bfloat> %5, <vscale x 8 x i16> %a)
+ %res2 = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tbl2.nxv8bf16(<vscale x 8 x bfloat> %3, <vscale x 8 x bfloat> %6, <vscale x 8 x i16> %a)
+ %ins1 = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } poison, <vscale x 8 x bfloat> %res1, 0
+ %ins2 = insertvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %ins1, <vscale x 8 x bfloat> %res2, 1
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %ins2
+}
+
+define { <vscale x 4 x float>, <vscale x 4 x float> } @tbl2_f32_tuple(i64 %stride, ptr %ptr, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: tbl2_f32_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1w { z3.s, z11.s }, pn8/z, [x1]
+; CHECK-NEXT: ld1w { z4.s, z12.s }, pn8/z, [x8]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: tbl z2.s, { z3.s, z4.s }, z0.s
+; CHECK-NEXT: tbl z1.s, { z11.s, z12.s }, z0.s
+; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %1, 0
+ %3 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %4, 0
+ %6 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %4, 1
+ %res1 = call <vscale x 4 x float> @llvm.aarch64.sve.tbl2.nxv4f32(<vscale x 4 x float> %2, <vscale x 4 x float> %5, <vscale x 4 x i32> %a)
+ %res2 = call <vscale x 4 x float> @llvm.aarch64.sve.tbl2.nxv4f32(<vscale x 4 x float> %3, <vscale x 4 x float> %6, <vscale x 4 x i32> %a)
+ %ins1 = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } poison, <vscale x 4 x float> %res1, 0
+ %ins2 = insertvalue { <vscale x 4 x float>, <vscale x 4 x float> } %ins1, <vscale x 4 x float> %res2, 1
+ ret { <vscale x 4 x float>, <vscale x 4 x float> } %ins2
+}
+
+define { <vscale x 2 x double>, <vscale x 2 x double> } @tbl2_f64_tuple(i64 %stride, ptr %ptr, <vscale x 2 x i64> %a) {
+; CHECK-LABEL: tbl2_f64_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1d { z3.d, z11.d }, pn8/z, [x1]
+; CHECK-NEXT: ld1d { z4.d, z12.d }, pn8/z, [x8]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: tbl z2.d, { z3.d, z4.d }, z0.d
+; CHECK-NEXT: tbl z1.d, { z11.d, z12.d }, z0.d
+; CHECK-NEXT: ldr z12, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %1, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0
+ %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1
+ %res1 = call <vscale x 2 x double> @llvm.aarch64.sve.tbl2.nxv2f64(<vscale x 2 x double> %2, <vscale x 2 x double> %5, <vscale x 2 x i64> %a)
+ %res2 = call <vscale x 2 x double> @llvm.aarch64.sve.tbl2.nxv2f64(<vscale x 2 x double> %3, <vscale x 2 x double> %6, <vscale x 2 x i64> %a)
+ %ins1 = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } poison, <vscale x 2 x double> %res1, 0
+ %ins2 = insertvalue { <vscale x 2 x double>, <vscale x 2 x double> } %ins1, <vscale x 2 x double> %res2, 1
+ ret { <vscale x 2 x double>, <vscale x 2 x double> } %ins2
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.tbl2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.tbl2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.tbl2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.tbl2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.tbl2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i16>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.tbl2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tbl2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i64>)
+
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.tbl2.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i16>)
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve2,+bf16" }
diff --git a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
index 38d3bed..b0390ec 100644
--- a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
@@ -1,13 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+fp8 -verify-machineinstrs -force-streaming < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+fp8 -verify-machineinstrs -force-streaming -enable-subreg-liveness < %s | FileCheck %s
; FCVT / FCVTN / BFCVT
define <vscale x 16 x i8> @fcvt_x2(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1) {
; CHECK-LABEL: fcvt_x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fcvt z0.b, { z0.h, z1.h }
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x2.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1)
@@ -17,10 +15,6 @@ define <vscale x 16 x i8> @fcvt_x2(<vscale x 8 x half> %zn0, <vscale x 8 x half>
define <vscale x 16 x i8> @fcvt_x4(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: fcvt_x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: fcvt z0.b, { z0.s - z3.s }
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x4(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
@@ -28,13 +22,88 @@ define <vscale x 16 x i8> @fcvt_x4(<vscale x 4 x float> %zn0, <vscale x 4 x floa
ret <vscale x 16 x i8> %res
}
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @fcvt_x4_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: fcvt_x4_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-9
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: add x9, x1, x0
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ld1w { z16.s, z20.s, z24.s, z28.s }, pn8/z, [x1]
+; CHECK-NEXT: ld1w { z17.s, z21.s, z25.s, z29.s }, pn8/z, [x9]
+; CHECK-NEXT: add x10, x1, x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z18.s, z22.s, z26.s, z30.s }, pn8/z, [x10]
+; CHECK-NEXT: ld1w { z19.s, z23.s, z27.s, z31.s }, pn8/z, [x8]
+; CHECK-NEXT: fcvt z0.b, { z16.s - z19.s }
+; CHECK-NEXT: fcvt z1.b, { z20.s - z23.s }
+; CHECK-NEXT: fcvt z2.b, { z24.s - z27.s }
+; CHECK-NEXT: fcvt z3.b, { z28.s - z31.s }
+; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #9
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 0
+ %3 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 1
+ %4 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 2
+ %5 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 0
+ %8 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 1
+ %9 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 2
+ %10 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 0
+ %13 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 1
+ %14 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 2
+ %15 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 0
+ %18 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 1
+ %19 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 2
+ %20 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 3
+ %res1 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x4(<vscale x 4 x float> %2, <vscale x 4 x float> %7, <vscale x 4 x float> %12, <vscale x 4 x float> %17)
+ %res2 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x4(<vscale x 4 x float> %3, <vscale x 4 x float> %8, <vscale x 4 x float> %13, <vscale x 4 x float> %18)
+ %res3 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x4(<vscale x 4 x float> %4, <vscale x 4 x float> %9, <vscale x 4 x float> %14, <vscale x 4 x float> %19)
+ %res4 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x4(<vscale x 4 x float> %5, <vscale x 4 x float> %10, <vscale x 4 x float> %15, <vscale x 4 x float> %20)
+ %ins1 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %res1, 0
+ %ins2 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ins1, <vscale x 16 x i8> %res2, 1
+ %ins3 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ins2, <vscale x 16 x i8> %res3, 2
+ %ins4 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ins3, <vscale x 16 x i8> %res4, 3
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ins4
+}
+
define <vscale x 16 x i8> @fcvtn(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: fcvtn:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: fcvtn z0.b, { z0.s - z3.s }
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.x4(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
@@ -45,14 +114,53 @@ define <vscale x 16 x i8> @fcvtn(<vscale x 4 x float> %zn0, <vscale x 4 x float>
define <vscale x 16 x i8> @bfcvt(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1) {
; CHECK-LABEL: bfcvt:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: bfcvt z0.b, { z0.h, z1.h }
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x2.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1)
ret <vscale x 16 x i8> %res
}
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @bfcvt_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: bfcvt_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x8]
+; CHECK-NEXT: bfcvt z0.b, { z2.h, z3.h }
+; CHECK-NEXT: bfcvt z1.b, { z10.h, z11.h }
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %1, 0
+ %3 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %4, 0
+ %6 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %4, 1
+ %res1 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x2.nxv8bf16(<vscale x 8 x bfloat> %2, <vscale x 8 x bfloat> %5)
+ %res2 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvt.x2.nxv8bf16(<vscale x 8 x bfloat> %3, <vscale x 8 x bfloat> %6)
+ %ins1 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %res1, 0
+ %ins2 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins1, <vscale x 16 x i8> %res2, 1
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins2
+}
+
; F1CVT / F2CVT
define { <vscale x 8 x half>, <vscale x 8 x half> } @f1cvt(<vscale x 16 x i8> %zm) {
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
index eee577c..1546763 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
;
; ADD Multi-Single x2
@@ -8,9 +8,7 @@
define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT: add za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT: ret
@@ -27,9 +25,7 @@ define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4
define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT: add za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT: ret
@@ -50,11 +46,7 @@ define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2
define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x4_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT: add za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT: ret
@@ -75,11 +67,7 @@ define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4
define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x4_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT: add za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT: ret
@@ -105,11 +93,7 @@ define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT: add za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT: ret
@@ -128,11 +112,7 @@ define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32>
define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT: add za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT: ret
@@ -155,15 +135,7 @@ define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64>
define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x4_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT: add za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT: ret
@@ -187,15 +159,7 @@ define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32>
define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x4_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT: add za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT: ret
@@ -223,9 +187,7 @@ define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64>
define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT: add za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT: ret
@@ -238,9 +200,7 @@ define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0,
define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: add za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
@@ -253,9 +213,7 @@ define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0,
define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fadd za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT: fadd za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT: ret
@@ -270,9 +228,7 @@ define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0
define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fadd za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: fadd za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
@@ -284,16 +240,37 @@ define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn
ret void
}
+define void @multi_vector_add_za_vg1x2_f64_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: multi_vector_add_za_vg1x2_f64_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x9, x1, x0
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1d { z16.d, z24.d }, pn8/z, [x1]
+; CHECK-NEXT: ld1d { z17.d, z25.d }, pn8/z, [x9]
+; CHECK-NEXT: fadd za.d[w8, 0, vgx2], { z16.d, z17.d }
+; CHECK-NEXT: fadd za.d[w8, 0, vgx2], { z24.d, z25.d }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %1, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0
+ %6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1
+ call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 0, <vscale x 2 x double> %2, <vscale x 2 x double> %5)
+ call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 0, <vscale x 2 x double> %3, <vscale x 2 x double> %6)
+ ret void
+}
+
; x4
define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT: add za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT: ret
@@ -310,11 +287,7 @@ define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0,
define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: add za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
@@ -331,11 +304,7 @@ define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0,
define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT: fadd za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT: ret
@@ -349,14 +318,62 @@ define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0
ret void
}
+define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: multi_vector_add_za_vg1x4_f32_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x0, #1
+; CHECK-NEXT: add x10, x1, x0
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ld1w { z16.s, z20.s, z24.s, z28.s }, pn8/z, [x1]
+; CHECK-NEXT: ld1w { z17.s, z21.s, z25.s, z29.s }, pn8/z, [x10]
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: add x11, x1, x9
+; CHECK-NEXT: add x9, x10, x9
+; CHECK-NEXT: ld1w { z18.s, z22.s, z26.s, z30.s }, pn8/z, [x11]
+; CHECK-NEXT: ld1w { z19.s, z23.s, z27.s, z31.s }, pn8/z, [x9]
+; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z16.s - z19.s }
+; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z20.s - z23.s }
+; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z24.s - z27.s }
+; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z28.s - z31.s }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 0
+ %3 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 1
+ %4 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 2
+ %5 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 0
+ %8 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 1
+ %9 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 2
+ %10 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 0
+ %13 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 1
+ %14 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 2
+ %15 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 0
+ %18 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 1
+ %19 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 2
+ %20 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %16, 3
+ call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %2, <vscale x 4 x float> %7, <vscale x 4 x float> %12, <vscale x 4 x float> %17)
+ call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %3, <vscale x 4 x float> %8, <vscale x 4 x float> %13, <vscale x 4 x float> %18)
+ call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %4, <vscale x 4 x float> %9, <vscale x 4 x float> %14, <vscale x 4 x float> %19)
+ call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 0, <vscale x 4 x float> %5, <vscale x 4 x float> %10, <vscale x 4 x float> %15, <vscale x 4 x float> %20)
+ ret void
+}
+
define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: fadd za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: fadd za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll
index ca149f4..416b848 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
target triple="aarch64-linux-gnu"
@@ -23,21 +23,44 @@ define void @fdot_multi_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
ret void
}
+define void @fdot_multi_za32_f16_vg1x2_tuple(i64 %stride, ptr %ptr) #0 {
+; CHECK-LABEL: fdot_multi_za32_f16_vg1x2_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x9, x1, x0
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
+; CHECK-NEXT: fdot za.s[w8, 0, vgx2], { z16.h, z17.h }, { z24.h, z25.h }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %1, 0
+ %3 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %4, 0
+ %6 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %4, 1
+ call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 0, <vscale x 8 x half> %2, <vscale x 8 x half> %5, <vscale x 8 x half> %3, <vscale x 8 x half> %6)
+ ret void
+}
+
define void @fdot_multi_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
; CHECK-LABEL: fdot_multi_za32_f16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
-; CHECK-NEXT: fdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
+; CHECK-NEXT: fdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x half> %zn4, <vscale x 8 x half> %zn5, <vscale x 8 x half> %zn6, <vscale x 8 x half> %zn7) #0 {
call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
@@ -48,6 +71,54 @@ define void @fdot_multi_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
ret void
}
+define void @fdot_multi_za32_f16_vg1x4_tuple(i64 %stride, ptr %ptr) #0 {
+; CHECK-LABEL: fdot_multi_za32_f16_vg1x4_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x9, x0, x0, lsl #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x10, x1, x0
+; CHECK-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z17.h, z21.h, z25.h, z29.h }, pn8/z, [x10]
+; CHECK-NEXT: ld1h { z18.h, z22.h, z26.h, z30.h }, pn8/z, [x1, x0, lsl #1]
+; CHECK-NEXT: add x9, x1, x9
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1h { z19.h, z23.h, z27.h, z31.h }, pn8/z, [x9]
+; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
+; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z24.h - z27.h }, { z28.h - z31.h }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %1, 0
+ %3 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %1, 1
+ %4 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %1, 2
+ %5 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %6, 0
+ %8 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %6, 1
+ %9 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %6, 2
+ %10 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %11, 0
+ %13 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %11, 1
+ %14 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %11, 2
+ %15 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %16, 0
+ %18 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %16, 1
+ %19 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %16, 2
+ %20 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %16, 3
+ call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 0, <vscale x 8 x half> %2, <vscale x 8 x half> %7, <vscale x 8 x half> %12, <vscale x 8 x half> %17,
+ <vscale x 8 x half> %3, <vscale x 8 x half> %8, <vscale x 8 x half> %13, <vscale x 8 x half> %18)
+ call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 0, <vscale x 8 x half> %4, <vscale x 8 x half> %9, <vscale x 8 x half> %14, <vscale x 8 x half> %19,
+ <vscale x 8 x half> %5, <vscale x 8 x half> %10, <vscale x 8 x half> %15, <vscale x 8 x half> %20)
+ ret void
+}
; == Multi, multi (16-bit bfloat) ==
@@ -71,18 +142,18 @@ define void @bfdot_multi_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
; CHECK-LABEL: fdot_multi_za32_bf16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
-; CHECK-NEXT: bfdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
+; CHECK-NEXT: bfdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x bfloat> %zn4, <vscale x 8 x bfloat> %zn5, <vscale x 8 x bfloat> %zn6, <vscale x 8 x bfloat> %zn7) #0 {
call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
@@ -99,9 +170,7 @@ define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @fdot_single_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) #0 {
; CHECK-LABEL: fdot_single_za32_f16_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: fdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: fdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -114,11 +183,7 @@ define void @fdot_single_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @fdot_single_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) #0 {
; CHECK-LABEL: fdot_single_za32_f16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: fdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -134,9 +199,7 @@ define void @fdot_single_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @bfdot_single_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) #0 {
; CHECK-LABEL: bfdot_single_za32_bf16_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: bfdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: bfdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -149,11 +212,7 @@ define void @bfdot_single_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused
define void @bfdot_single_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) #0 {
; CHECK-LABEL: bfdot_single_za32_bf16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: bfdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -170,8 +229,8 @@ define void @fdot_lane_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: fdot_lane_za32_f16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: fdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: fdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: ret
@@ -185,8 +244,8 @@ define void @fdot_lane_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: fdot_lane_za32_f16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
@@ -207,8 +266,8 @@ define void @bfdot_lane_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
; CHECK-LABEL: bfdot_lane_za32_bf16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: bfdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: bfdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: ret
@@ -222,8 +281,8 @@ define void @bfdot_lane_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
; CHECK-LABEL: bfdot_lane_za32_bf16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll
index c7a2ec1..2e4a5f0 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
;
; Move Multi-Vector To Tile (Write) x 2
@@ -10,9 +10,7 @@
define void @za_write_vg2_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_b:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0h.b[w12, 0:1], { z0.b, z1.b }
; CHECK-NEXT: mov za0h.b[w12, 14:15], { z0.b, z1.b }
; CHECK-NEXT: ret
@@ -22,12 +20,34 @@ define void @za_write_vg2_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x
ret void
}
+define void @za_write_vg2_horiz_b_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: za_write_vg2_horiz_b_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x1]
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x1, x0]
+; CHECK-NEXT: mov za0h.b[w12, 0:1], { z16.b, z17.b }
+; CHECK-NEXT: mov za0h.b[w12, 0:1], { z24.b, z25.b }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5)
+ call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6)
+ ret void
+}
+
define void @za_write_vg2_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_h:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
@@ -40,9 +60,7 @@ define void @za_write_vg2_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x
define void @za_write_vg2_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
@@ -55,9 +73,7 @@ define void @za_write_vg2_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscal
define void @za_write_vg2_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
@@ -70,9 +86,7 @@ define void @za_write_vg2_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vs
define void @za_write_vg2_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0h.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT: mov za3h.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT: ret
@@ -85,9 +99,7 @@ define void @za_write_vg2_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x
define void @za_write_vg2_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0h.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT: mov za3h.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT: ret
@@ -100,9 +112,7 @@ define void @za_write_vg2_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vsca
define void @za_write_vg2_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0h.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2)
@@ -112,9 +122,7 @@ define void @za_write_vg2_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x
define void @za_write_vg2_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0h.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2)
@@ -126,9 +134,7 @@ define void @za_write_vg2_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vsc
define void @za_write_vg2_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_b:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0v.b[w12, 0:1], { z0.b, z1.b }
; CHECK-NEXT: mov za0v.b[w12, 14:15], { z0.b, z1.b }
; CHECK-NEXT: ret
@@ -141,9 +147,7 @@ define void @za_write_vg2_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x
define void @za_write_vg2_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_h:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
@@ -156,9 +160,7 @@ define void @za_write_vg2_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x
define void @za_write_vg2_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
@@ -171,9 +173,7 @@ define void @za_write_vg2_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale
define void @za_write_vg2_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
@@ -186,9 +186,7 @@ define void @za_write_vg2_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vsc
define void @za_write_vg2_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0v.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT: mov za3v.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT: ret
@@ -201,9 +199,7 @@ define void @za_write_vg2_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x
define void @za_write_vg2_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0v.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT: mov za3v.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT: ret
@@ -216,9 +212,7 @@ define void @za_write_vg2_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscal
define void @za_write_vg2_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0v.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2)
@@ -228,9 +222,7 @@ define void @za_write_vg2_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x
define void @za_write_vg2_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za0v.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2)
@@ -246,11 +238,7 @@ define void @za_write_vg2_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vsca
define void @za_write_vg4_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_b:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0h.b[w12, 0:3], { z0.b - z3.b }
; CHECK-NEXT: mov za0h.b[w12, 12:15], { z0.b - z3.b }
; CHECK-NEXT: ret
@@ -263,11 +251,7 @@ define void @za_write_vg4_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x
define void @za_write_vg4_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_h:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
@@ -280,11 +264,7 @@ define void @za_write_vg4_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x
define void @za_write_vg4_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
@@ -297,11 +277,7 @@ define void @za_write_vg4_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscal
define void @za_write_vg4_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
@@ -314,11 +290,7 @@ define void @za_write_vg4_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vs
define void @za_write_vg4_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0h.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
@@ -328,11 +300,7 @@ define void @za_write_vg4_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x
define void @za_write_vg4_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0h.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
@@ -342,11 +310,7 @@ define void @za_write_vg4_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vsca
define void @za_write_vg4_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0h.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
@@ -356,11 +320,7 @@ define void @za_write_vg4_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x
define void @za_write_vg4_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0h.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4)
@@ -372,11 +332,7 @@ define void @za_write_vg4_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vsc
define void @za_write_vg4_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_b:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0v.b[w12, 0:3], { z0.b - z3.b }
; CHECK-NEXT: mov za0v.b[w12, 12:15], { z0.b - z3.b }
; CHECK-NEXT: ret
@@ -389,11 +345,7 @@ define void @za_write_vg4_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x
define void @za_write_vg4_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_h:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
@@ -406,11 +358,7 @@ define void @za_write_vg4_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x
define void @za_write_vg4_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
@@ -423,11 +371,7 @@ define void @za_write_vg4_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale
define void @za_write_vg4_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
@@ -440,11 +384,7 @@ define void @za_write_vg4_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vsc
define void @za_write_vg4_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0v.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
@@ -454,11 +394,7 @@ define void @za_write_vg4_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x
define void @za_write_vg4_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0v.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
@@ -468,11 +404,7 @@ define void @za_write_vg4_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscal
define void @za_write_vg4_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0v.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
@@ -482,17 +414,65 @@ define void @za_write_vg4_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x
define void @za_write_vg4_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za0v.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4)
ret void
}
+define void @za_write_vg4_vert_f64_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: za_write_vg4_vert_f64_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: add x9, x1, x0
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x1]
+; CHECK-NEXT: ld1d { z17.d, z21.d, z25.d, z29.d }, pn8/z, [x9]
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: add x10, x1, x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1d { z18.d, z22.d, z26.d, z30.d }, pn8/z, [x10]
+; CHECK-NEXT: ld1d { z19.d, z23.d, z27.d, z31.d }, pn8/z, [x8]
+; CHECK-NEXT: mov za0v.d[w12, 0:3], { z16.d - z19.d }
+; CHECK-NEXT: mov za0v.d[w12, 0:3], { z20.d - z23.d }
+; CHECK-NEXT: mov za0v.d[w12, 0:3], { z24.d - z27.d }
+; CHECK-NEXT: mov za0v.d[w12, 0:3], { z28.d - z31.d }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 1
+ %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 2
+ %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
+ %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
+ %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 2
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
+ %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 1
+ %14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 2
+ %15 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 0
+ %18 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 1
+ %19 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 2
+ %20 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 3
+ call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %2, <vscale x 2 x double> %7, <vscale x 2 x double> %12, <vscale x 2 x double> %17)
+ call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %3, <vscale x 2 x double> %8, <vscale x 2 x double> %13, <vscale x 2 x double> %18)
+ call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %4, <vscale x 2 x double> %9, <vscale x 2 x double> %14, <vscale x 2 x double> %19)
+ call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %5, <vscale x 2 x double> %10, <vscale x 2 x double> %15, <vscale x 2 x double> %20)
+ ret void
+}
+
;
; Move Multi-Vector To ZA (Write) x2
;
@@ -500,9 +480,7 @@ define void @za_write_vg4_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vsca
define void @za_write_vg1x2_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2) {
; CHECK-LABEL: za_write_vg1x2_b:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
@@ -512,12 +490,34 @@ define void @za_write_vg1x2_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16
ret void
}
+define void @za_write_vg1x2_b_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: za_write_vg1x2_b_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x1]
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x1, x0]
+; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z16.d, z17.d }
+; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z24.d, z25.d }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5)
+ call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6)
+ ret void
+}
+
define void @za_write_vg1x2_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2) {
; CHECK-LABEL: za_write_vg1x2_h:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
@@ -530,9 +530,7 @@ define void @za_write_vg1x2_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x
define void @za_write_vg1x2_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2) {
; CHECK-LABEL: za_write_vg1x2_f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
@@ -545,9 +543,7 @@ define void @za_write_vg1x2_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x
define void @za_write_vg1x2_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2) {
; CHECK-LABEL: za_write_vg1x2_bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
@@ -560,9 +556,7 @@ define void @za_write_vg1x2_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale
define void @za_write_vg1x2_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2) {
; CHECK-LABEL: za_write_vg1x2_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
@@ -575,9 +569,7 @@ define void @za_write_vg1x2_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x
define void @za_write_vg1x2_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2) {
; CHECK-LABEL: za_write_vg1x2_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
@@ -590,9 +582,7 @@ define void @za_write_vg1x2_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x
define void @za_write_vg1x2_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2) {
; CHECK-LABEL: za_write_vg1x2_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
@@ -605,9 +595,7 @@ define void @za_write_vg1x2_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x
define void @za_write_vg1x2_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2) {
; CHECK-LABEL: za_write_vg1x2_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
@@ -624,11 +612,7 @@ define void @za_write_vg1x2_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale
define void @za_write_vg1x4_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4) {
; CHECK-LABEL: za_write_vg1x4_b:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
@@ -641,11 +625,7 @@ define void @za_write_vg1x4_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16
define void @za_write_vg1x4_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4) {
; CHECK-LABEL: za_write_vg1x4_h:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
@@ -658,11 +638,7 @@ define void @za_write_vg1x4_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x
define void @za_write_vg1x4_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4) {
; CHECK-LABEL: za_write_vg1x4_f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
@@ -675,11 +651,7 @@ define void @za_write_vg1x4_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x
define void @za_write_vg1x4_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4) {
; CHECK-LABEL: za_write_vg1x4_bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
@@ -692,11 +664,7 @@ define void @za_write_vg1x4_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale
define void @za_write_vg1x4_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4) {
; CHECK-LABEL: za_write_vg1x4_s:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
@@ -709,11 +677,7 @@ define void @za_write_vg1x4_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x
define void @za_write_vg1x4_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4) {
; CHECK-LABEL: za_write_vg1x4_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
@@ -726,11 +690,7 @@ define void @za_write_vg1x4_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x
define void @za_write_vg1x4_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4) {
; CHECK-LABEL: za_write_vg1x4_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
@@ -743,11 +703,7 @@ define void @za_write_vg1x4_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x
define void @za_write_vg1x4_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4) {
; CHECK-LABEL: za_write_vg1x4_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
@@ -757,6 +713,58 @@ define void @za_write_vg1x4_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale
ret void
}
+define void @za_write_vg1x4_f64_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: za_write_vg1x4_f64_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x0, #1
+; CHECK-NEXT: add x10, x1, x0
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x1]
+; CHECK-NEXT: ld1d { z17.d, z21.d, z25.d, z29.d }, pn8/z, [x10]
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: add x11, x1, x9
+; CHECK-NEXT: add x9, x10, x9
+; CHECK-NEXT: ld1d { z18.d, z22.d, z26.d, z30.d }, pn8/z, [x11]
+; CHECK-NEXT: ld1d { z19.d, z23.d, z27.d, z31.d }, pn8/z, [x9]
+; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z16.d - z19.d }
+; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z20.d - z23.d }
+; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z24.d - z27.d }
+; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z28.d - z31.d }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 0
+ %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 1
+ %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 2
+ %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
+ %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
+ %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 2
+ %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
+ %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 1
+ %14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 2
+ %15 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 0
+ %18 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 1
+ %19 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 2
+ %20 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 3
+ call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %2, <vscale x 2 x double> %7, <vscale x 2 x double> %12, <vscale x 2 x double> %17)
+ call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %3, <vscale x 2 x double> %8, <vscale x 2 x double> %13, <vscale x 2 x double> %18)
+ call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %4, <vscale x 2 x double> %9, <vscale x 2 x double> %14, <vscale x 2 x double> %19)
+ call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %5, <vscale x 2 x double> %10, <vscale x 2 x double> %15, <vscale x 2 x double> %20)
+ ret void
+}
+
declare void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index d8d796e..fba81ea 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -23,6 +23,28 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
ret void
}
+define void @udot_multi_za32_u16_vg1x2_tuple(i64 %stride, ptr %ptr) #1 {
+; CHECK-LABEL: udot_multi_za32_u16_vg1x2_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x1]
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x1, x0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z16.b, z17.b }, { z24.b, z25.b }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6)
+ ret void
+}
+
define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: udot_multi_za32_u16_vg1x4:
; CHECK: // %bb.0:
@@ -48,6 +70,54 @@ define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
ret void
}
+define void @udot_multi_za32_u16_vg1x4_tuple(i64 %stride, ptr %ptr) #1 {
+; CHECK-LABEL: udot_multi_za32_u16_vg1x4_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x0, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x1]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x1, x0]
+; CHECK-NEXT: add x10, x9, x0
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x1, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x1, x10]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, { z20.b - z23.b }
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, { z28.b - z31.b }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17,
+ <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18)
+ call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19,
+ <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20)
+ ret void
+}
+
define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
; CHECK-LABEL: udot_multi_za32_u8_vg1x2:
; CHECK: // %bb.0:
@@ -149,6 +219,28 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
ret void
}
+define void @usdot_multi_za32_u16_vg1x2_tuple(i64 %stride, ptr %ptr) #1 {
+; CHECK-LABEL: usdot_multi_za32_u16_vg1x2_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x1]
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x1, x0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z16.b, z17.b }, { z24.b, z25.b }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6)
+ ret void
+}
+
define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: usdot_multi_za32_u8_vg1x4:
; CHECK: // %bb.0:
@@ -174,6 +266,53 @@ define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
ret void
}
+define void @usdot_multi_za32_u16_vg1x4_tuple(i64 %stride, ptr %ptr) #1 {
+; CHECK-LABEL: usdot_multi_za32_u16_vg1x4_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x0, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x1]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x1, x0]
+; CHECK-NEXT: add x10, x9, x0
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x1, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x1, x10]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, { z20.b - z23.b }
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, { z28.b - z31.b }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17,
+ <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18)
+ call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19,
+ <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20)
+ ret void
+}
; == Multi, multi (signed) ==
@@ -194,6 +333,28 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
ret void
}
+define void @sdot_multi_za32_u16_vg1x2_tuple(i64 %stride, ptr %ptr) #0 {
+; CHECK-LABEL: sdot_multi_za32_u16_vg1x2_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x1]
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x1, x0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z16.b, z17.b }, { z24.b, z25.b }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6)
+ ret void
+}
+
define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: sdot_multi_za32_u16_vg1x4:
; CHECK: // %bb.0:
@@ -219,6 +380,54 @@ define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
ret void
}
+define void @sdot_multi_za32_u16_vg1x4_tuple(i64 %stride, ptr %ptr) #0 {
+; CHECK-LABEL: sdot_multi_za32_u16_vg1x4_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x0, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x1]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x1, x0]
+; CHECK-NEXT: add x10, x9, x0
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x1, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x1, x10]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, { z20.b - z23.b }
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, { z28.b - z31.b }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17,
+ <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18)
+ call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19,
+ <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20)
+ ret void
+}
+
define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
; CHECK-LABEL: sdot_multi_za32_u8_vg1x2:
; CHECK: // %bb.0:
@@ -278,6 +487,29 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
ret void
}
+define void @sdot_multi_za64_u16_vg1x2_tuple(i64 %stride, ptr %ptr) #1 {
+; CHECK-LABEL: sdot_multi_za64_u16_vg1x2_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x9, x1, x0
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
+; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z16.h, z17.h }, { z24.h, z25.h }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+ %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+ %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+ call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6)
+ ret void
+}
+
define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: sdot_multi_za64_u16_vg1x4:
; CHECK: // %bb.0:
@@ -303,6 +535,54 @@ define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
ret void
}
+define void @sdot_multi_za64_u16_vg1x4_tuple(i64 %stride, ptr %ptr) #1 {
+; CHECK-LABEL: sdot_multi_za64_u16_vg1x4_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x9, x0, x0, lsl #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x10, x1, x0
+; CHECK-NEXT: ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z17.h, z21.h, z25.h, z29.h }, pn8/z, [x10]
+; CHECK-NEXT: ld1h { z18.h, z22.h, z26.h, z30.h }, pn8/z, [x1, x0, lsl #1]
+; CHECK-NEXT: add x9, x1, x9
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1h { z19.h, z23.h, z27.h, z31.h }, pn8/z, [x9]
+; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
+; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z24.h - z27.h }, { z28.h - z31.h }
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+ %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+ %4 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 2
+ %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 0
+ %8 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 1
+ %9 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 2
+ %10 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 0
+ %13 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 1
+ %14 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 2
+ %15 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 0
+ %18 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 1
+ %19 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 2
+ %20 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 3
+ call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %7, <vscale x 8 x i16> %12, <vscale x 8 x i16> %17,
+ <vscale x 8 x i16> %3, <vscale x 8 x i16> %8, <vscale x 8 x i16> %13, <vscale x 8 x i16> %18)
+ call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %4, <vscale x 8 x i16> %9, <vscale x 8 x i16> %14, <vscale x 8 x i16> %19,
+ <vscale x 8 x i16> %5, <vscale x 8 x i16> %10, <vscale x 8 x i16> %15, <vscale x 8 x i16> %20)
+ ret void
+}
; == Multi, single (unsigned) ==
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll
index 9e7d96c..b4a83c1 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-qcvt.ll
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py$
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
;
; SQCVT
@@ -44,6 +44,85 @@ define <vscale x 8 x i16> @multi_vector_qcvt_x4_s16_s64(<vscale x 2 x i64> %unus
ret <vscale x 8 x i16> %res
}
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vector_qcvt_x4_s16_s64_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: multi_vector_qcvt_x4_s16_s64_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-9
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: add x9, x1, x0
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x1]
+; CHECK-NEXT: ld1d { z17.d, z21.d, z25.d, z29.d }, pn8/z, [x9]
+; CHECK-NEXT: add x10, x1, x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1d { z18.d, z22.d, z26.d, z30.d }, pn8/z, [x10]
+; CHECK-NEXT: ld1d { z19.d, z23.d, z27.d, z31.d }, pn8/z, [x8]
+; CHECK-NEXT: sqcvt z0.h, { z16.d - z19.d }
+; CHECK-NEXT: sqcvt z1.h, { z20.d - z23.d }
+; CHECK-NEXT: sqcvt z2.h, { z24.d - z27.d }
+; CHECK-NEXT: sqcvt z3.h, { z28.d - z31.d }
+; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #9
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %1, 0
+ %3 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %1, 1
+ %4 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %1, 2
+ %5 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %6, 0
+ %8 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %6, 1
+ %9 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %6, 2
+ %10 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %11, 0
+ %13 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %11, 1
+ %14 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %11, 2
+ %15 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %16, 0
+ %18 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %16, 1
+ %19 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %16, 2
+ %20 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %16, 3
+ %res1 = call <vscale x 8 x i16> @llvm.aarch64.sve.sqcvt.x4.nxv2i64(<vscale x 2 x i64> %2, <vscale x 2 x i64> %7, <vscale x 2 x i64> %12, <vscale x 2 x i64> %17)
+ %res2 = call <vscale x 8 x i16> @llvm.aarch64.sve.sqcvt.x4.nxv2i64(<vscale x 2 x i64> %3, <vscale x 2 x i64> %8, <vscale x 2 x i64> %13, <vscale x 2 x i64> %18)
+ %res3 = call <vscale x 8 x i16> @llvm.aarch64.sve.sqcvt.x4.nxv2i64(<vscale x 2 x i64> %4, <vscale x 2 x i64> %9, <vscale x 2 x i64> %14, <vscale x 2 x i64> %19)
+ %res4 = call <vscale x 8 x i16> @llvm.aarch64.sve.sqcvt.x4.nxv2i64(<vscale x 2 x i64> %5, <vscale x 2 x i64> %10, <vscale x 2 x i64> %15, <vscale x 2 x i64> %20)
+ %ins1 = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> %res1, 0
+ %ins2 = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %ins1, <vscale x 8 x i16> %res2, 1
+ %ins3 = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %ins2, <vscale x 8 x i16> %res3, 2
+ %ins4 = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %ins3, <vscale x 8 x i16> %res4, 3
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %ins4
+}
+
;
; UQCVT
;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-qrshr.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-qrshr.ll
index f007055..0bc9e15 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-qrshr.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-qrshr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
;
; S/UQRSHR x2
@@ -16,6 +16,46 @@ define <vscale x 8 x i16> @multi_vector_sat_shift_narrow_x2_s16(<vscale x 4 x i3
ret <vscale x 8 x i16> %res
}
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vector_sat_shift_narrow_x2_s16_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: multi_vector_sat_shift_narrow_x2_s16_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1w { z2.s, z10.s }, pn8/z, [x1]
+; CHECK-NEXT: ld1w { z3.s, z11.s }, pn8/z, [x8]
+; CHECK-NEXT: sqrshr z0.h, { z2.s, z3.s }, #16
+; CHECK-NEXT: sqrshr z1.h, { z10.s, z11.s }, #16
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %1, 0
+ %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %4 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %4, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %4, 1
+ %res1 = call <vscale x 8 x i16> @llvm.aarch64.sve.sqrshr.x2.nxv8i16(<vscale x 4 x i32> %2, <vscale x 4 x i32> %5, i32 16)
+ %res2 = call <vscale x 8 x i16> @llvm.aarch64.sve.sqrshr.x2.nxv8i16(<vscale x 4 x i32> %3, <vscale x 4 x i32> %6, i32 16)
+ %ins1 = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } poison, <vscale x 8 x i16> %res1, 0
+ %ins2 = insertvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %ins1, <vscale x 8 x i16> %res2, 1
+ ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %ins2
+}
+
define <vscale x 8 x i16> @multi_vector_sat_shift_narrow_x2_u16(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
; CHECK-LABEL: multi_vector_sat_shift_narrow_x2_u16:
; CHECK: // %bb.0:
@@ -44,6 +84,85 @@ define <vscale x 16 x i8> @multi_vector_sat_shift_narrow_x4_s8(<vscale x 4 x i32
ret <vscale x 16 x i8> %res
}
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vector_sat_shift_narrow_x4_s8_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: multi_vector_sat_shift_narrow_x4_s8_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-9
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: lsl x8, x0, #1
+; CHECK-NEXT: add x9, x1, x0
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: ld1w { z16.s, z20.s, z24.s, z28.s }, pn8/z, [x1]
+; CHECK-NEXT: ld1w { z17.s, z21.s, z25.s, z29.s }, pn8/z, [x9]
+; CHECK-NEXT: add x10, x1, x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z18.s, z22.s, z26.s, z30.s }, pn8/z, [x10]
+; CHECK-NEXT: ld1w { z19.s, z23.s, z27.s, z31.s }, pn8/z, [x8]
+; CHECK-NEXT: sqrshr z0.b, { z16.s - z19.s }, #32
+; CHECK-NEXT: sqrshr z1.b, { z20.s - z23.s }, #32
+; CHECK-NEXT: sqrshr z2.b, { z24.s - z27.s }, #32
+; CHECK-NEXT: sqrshr z3.b, { z28.s - z31.s }, #32
+; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #9
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %1, 0
+ %3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %1, 1
+ %4 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %1, 2
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %6, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %6, 1
+ %9 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %6, 2
+ %10 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %11, 0
+ %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %11, 1
+ %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %11, 2
+ %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %16, 0
+ %18 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %16, 1
+ %19 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %16, 2
+ %20 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %16, 3
+ %res1 = call <vscale x 16 x i8> @llvm.aarch64.sve.sqrshr.x4.nxv16i8(<vscale x 4 x i32> %2, <vscale x 4 x i32> %7, <vscale x 4 x i32> %12, <vscale x 4 x i32> %17, i32 32)
+ %res2 = call <vscale x 16 x i8> @llvm.aarch64.sve.sqrshr.x4.nxv16i8(<vscale x 4 x i32> %3, <vscale x 4 x i32> %8, <vscale x 4 x i32> %13, <vscale x 4 x i32> %18, i32 32)
+ %res3 = call <vscale x 16 x i8> @llvm.aarch64.sve.sqrshr.x4.nxv16i8(<vscale x 4 x i32> %4, <vscale x 4 x i32> %9, <vscale x 4 x i32> %14, <vscale x 4 x i32> %19, i32 32)
+ %res4 = call <vscale x 16 x i8> @llvm.aarch64.sve.sqrshr.x4.nxv16i8(<vscale x 4 x i32> %5, <vscale x 4 x i32> %10, <vscale x 4 x i32> %15, <vscale x 4 x i32> %20, i32 32)
+ %ins1 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %res1, 0
+ %ins2 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ins1, <vscale x 16 x i8> %res2, 1
+ %ins3 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ins2, <vscale x 16 x i8> %res3, 2
+ %ins4 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ins3, <vscale x 16 x i8> %res4, 3
+ ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %ins4
+}
+
define <vscale x 8 x i16> @multi_vector_sat_shift_narrow_x4_s16(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
; CHECK-LABEL: multi_vector_sat_shift_narrow_x4_s16:
; CHECK: // %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir
new file mode 100644
index 0000000..23da26d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus-fake16.mir
@@ -0,0 +1,374 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=-real-true16 -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+
+---
+name: fcmp_false_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_false_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(false), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_oeq_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_oeq_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_EQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(oeq), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ogt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ogt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_GT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ogt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_oge_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_oge_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_GE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(oge), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_olt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_olt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_LT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(olt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ole_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ole_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_LE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ole), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+---
+name: fcmp_one_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_one_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(one), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ord_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ord_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX11-NEXT: [[V_CMP_O_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_O_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_O_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+    %4:vcc(s1) = G_FCMP floatpred(ord), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_uno_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_uno_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_U_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(uno), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ueq_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ueq_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NLG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ueq), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ugt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ugt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NLE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ugt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_uge_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_uge_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NLT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(uge), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ult_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ult_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ult), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ule_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ule_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NGT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ule), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_une_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_une_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[V_CMP_NEQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_fake16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(une), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_true_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_true_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(true), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir
new file mode 100644
index 0000000..a7140e6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.gfx11plus.mir
@@ -0,0 +1,402 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=+real-true16 -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
+
+---
+name: fcmp_false_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_false_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(false), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_oeq_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_oeq_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(oeq), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ogt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ogt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_GT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ogt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_oge_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_oge_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_GE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(oge), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_olt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_olt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(olt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ole_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ole_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_LE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ole), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+---
+name: fcmp_one_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_one_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_LG_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(one), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ord_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ord_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+    ; GFX11-NEXT: [[V_CMP_O_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_O_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_O_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+    %4:vcc(s1) = G_FCMP floatpred(ord), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_uno_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_uno_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_U_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(uno), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ueq_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ueq_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NLG_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ueq), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ugt_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ugt_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NLE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ugt), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_uge_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_uge_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NLT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(uge), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ult_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ult_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NGE_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ult), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_ule_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_ule_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NGT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(ule), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_une_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_une_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+ ; GFX11-NEXT: [[V_CMP_NEQ_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_t16_e64_]]
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(une), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fcmp_true_s16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: fcmp_true_s16_vv
+ ; GFX11: liveins: $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s16) = G_TRUNC %0
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vcc(s1) = G_FCMP floatpred(true), %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir
index 5c387baf..85b1d40 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcmp.s16.mir
@@ -1,7 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE64 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s

---
name: fcmp_false_s16_vv
@@ -31,15 +30,6 @@ body: |
; WAVE32-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]]
; WAVE32-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
;
- ; GFX11-LABEL: name: fcmp_false_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
- ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(false), [[TRUNC]](s16), [[TRUNC1]]
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -72,13 +62,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_EQ_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_oeq_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_EQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_EQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -111,13 +94,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_GT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ogt_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_GT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GT_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -150,13 +126,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_GE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_oge_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_GE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_GE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_GE_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -189,13 +158,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_LT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_olt_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_LT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LT_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -228,13 +190,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_LE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ole_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_LE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LE_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -266,13 +221,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_one_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -305,13 +253,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_LG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ord_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_LG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_LG_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -344,13 +285,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_U_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_uno_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_U_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_U_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -383,13 +317,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NLG_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ueq_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NLG_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLG_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLG_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -422,13 +349,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NLE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ugt_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NLE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLE_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -461,13 +381,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NLT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_uge_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NLT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NLT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NLT_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -500,13 +413,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NGE_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ult_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGE_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGE_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -539,13 +445,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NGT_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_ule_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NGT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NGT_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NGT_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -578,13 +477,6 @@ body: |
; WAVE32-NEXT: [[V_CMP_NEQ_F16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_e64_]]
;
- ; GFX11-LABEL: name: fcmp_une_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX11-NEXT: [[V_CMP_NEQ_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_NEQ_F16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_NEQ_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -621,15 +513,6 @@ body: |
; WAVE32-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]]
; WAVE32-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
;
- ; GFX11-LABEL: name: fcmp_true_s16_vv
- ; GFX11: liveins: $vgpr0, $vgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
- ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX11-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(true), [[TRUNC]](s16), [[TRUNC1]]
- ; GFX11-NEXT: S_ENDPGM 0, implicit [[FCMP]](s1)
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll
new file mode 100644
index 0000000..090aa06
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/promote-dependency-on-invariant-result.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true --stop-after=irtranslator -o - %s | FileCheck %s
+
+declare ptr @llvm.invariant.start.p5(i64 immarg, ptr addrspace(5) nocapture)
+declare void @llvm.invariant.end.p5(ptr, i64 immarg, ptr addrspace(5) nocapture)
+
+define void @use_invariant_promotable_lds(ptr addrspace(5) %arg, i32 %i) {
+ ; CHECK-LABEL: name: use_invariant_promotable_lds
+ ; CHECK: bb.1.bb:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[DEF]](p0) :: (store (s32) into %ir.tmp)
+ ; CHECK-NEXT: SI_RETURN
+bb:
+ %tmp = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %arg)
+ call void @llvm.invariant.end.p5(ptr %tmp, i64 4, ptr addrspace(5) %arg)
+ store i32 0, ptr %tmp, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
index 0818f60..96775f47 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
@@ -8,6 +8,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .cs:
+; GCN-NEXT: .entry_point: _amdgpu_cs
; GCN-NEXT: .entry_point_symbol: cs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
index e37d22c..1379246 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
@@ -7,6 +7,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .es:
+; GCN-NEXT: .entry_point: _amdgpu_es
; GCN-NEXT: .entry_point_symbol: es_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
index d847f75..1fba34a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
@@ -8,6 +8,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .gs:
+; GCN-NEXT: .entry_point: _amdgpu_gs
; GCN-NEXT: .entry_point_symbol: gs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
index 74f5f44..53c6b95 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
@@ -8,6 +8,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .hs:
+; GCN-NEXT: .entry_point: _amdgpu_hs
; GCN-NEXT: .entry_point_symbol: hs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
index 287cc12..ebe7531 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
@@ -7,6 +7,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .ls:
+; GCN-NEXT: .entry_point: _amdgpu_ls
; GCN-NEXT: .entry_point_symbol: ls_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
index e176718..32f19e2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
@@ -11,6 +11,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .ps:
+; GCN-NEXT: .entry_point: _amdgpu_ps
; GCN-NEXT: .entry_point_symbol: amdpal_psenable
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
index b225d97..853d221 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
@@ -8,6 +8,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .vs:
+; GCN-NEXT: .entry_point: _amdgpu_vs
; GCN-NEXT: .entry_point_symbol: vs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal.ll b/llvm/test/CodeGen/AMDGPU/amdpal.ll
index 97fcf06..171df02 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal.ll
@@ -86,6 +86,7 @@ declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32,
; PAL-NEXT: amdpal.pipelines:
; PAL-NEXT: - .hardware_stages:
; PAL-NEXT: .cs:
+; PAL-NEXT: .entry_point: _amdgpu_cs
; PAL-NEXT: .entry_point_symbol: scratch2_cs
; PAL-NEXT: .scratch_memory_size: 0x10
; PAL-NEXT: .sgpr_count: 0x
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
new file mode 100644
index 0000000..2c6aabe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
@@ -0,0 +1,351 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+; Make sure stack use isn't introduced for these bitcasts.
+
+define i160 @bitcast_v5i32_to_i160(<5 x i32> %vec) {
+; GFX9-LABEL: bitcast_v5i32_to_i160:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v5i32_to_i160:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <5 x i32> %vec to i160
+ ret i160 %bitcast
+}
+
+define i192 @bitcast_v6i32_to_i192(<6 x i32> %vec) {
+; GFX9-LABEL: bitcast_v6i32_to_i192:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v6i32_to_i192:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <6 x i32> %vec to i192
+ ret i192 %bitcast
+}
+
+define i224 @bitcast_v7i32_to_i224(<7 x i32> %vec) {
+; GFX9-LABEL: bitcast_v7i32_to_i224:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v7i32_to_i224:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <7 x i32> %vec to i224
+ ret i224 %bitcast
+}
+
+define i256 @bitcast_v8i32_to_i256(<8 x i32> %vec) {
+; GFX9-LABEL: bitcast_v8i32_to_i256:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v8i32_to_i256:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <8 x i32> %vec to i256
+ ret i256 %bitcast
+}
+
+define <5 x i32> @bitcast_i160_to_v5i32(i160 %int) {
+; GFX9-LABEL: bitcast_i160_to_v5i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i160_to_v5i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i160 %int to <5 x i32>
+ ret <5 x i32> %bitcast
+}
+
+define <6 x i32> @bitcast_i192_to_v6i32(i192 %int) {
+; GFX9-LABEL: bitcast_i192_to_v6i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i192_to_v6i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i192 %int to <6 x i32>
+ ret <6 x i32> %bitcast
+}
+
+define <7 x i32> @bitcast_i224_to_v7i32(i224 %int) {
+; GFX9-LABEL: bitcast_i224_to_v7i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i224_to_v7i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i224 %int to <7 x i32>
+ ret <7 x i32> %bitcast
+}
+
+define <8 x i32> @bitcast_i256_to_v8i32(i256 %int) {
+; GFX9-LABEL: bitcast_i256_to_v8i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i256_to_v8i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i256 %int to <8 x i32>
+ ret <8 x i32> %bitcast
+}
+
+define i192 @bitcast_v3i64_to_i192(<3 x i64> %vec) {
+; GFX9-LABEL: bitcast_v3i64_to_i192:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v3i64_to_i192:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <3 x i64> %vec to i192
+ ret i192 %bitcast
+}
+
+define <3 x i64> @bitcast_i192_to_v3i64(i192 %int) {
+; GFX9-LABEL: bitcast_i192_to_v3i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i192_to_v3i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i192 %int to <3 x i64>
+ ret <3 x i64> %bitcast
+}
+
+define <10 x i16> @bitcast_i160_to_v10i16(i160 %int) {
+; GFX9-LABEL: bitcast_i160_to_v10i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i160_to_v10i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
+; GFX12-NEXT: v_bfi_b32 v2, 0xffff, v2, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i160 %int to <10 x i16>
+ ret <10 x i16> %bitcast
+}
+
+define i160 @bitcast_v10i16_to_i160(<10 x i16> %vec) {
+; GFX9-LABEL: bitcast_v10i16_to_i160:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v10i16_to_i160:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <10 x i16> %vec to i160
+ ret i160 %bitcast
+}
+
+define i12 @bitcast_v2i6_to_i12(<2 x i6> %vec) {
+; GFX9-LABEL: bitcast_v2i6_to_i12:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 6, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xfff, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v2i6_to_i12:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshlrev_b16 v1, 6, v1
+; GFX12-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX12-NEXT: v_and_b32_e32 v0, 0xfff, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <2 x i6> %vec to i12
+ ret i12 %bitcast
+}
+
+define <2 x i6> @bitcast_i12_to_v2i6(i12 %int) {
+; GFX9-LABEL: bitcast_i12_to_v2i6:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v2, 63, v0
+; GFX9-NEXT: v_lshrrev_b16_e32 v0, 6, v0
+; GFX9-NEXT: v_and_b32_e32 v1, 63, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i12_to_v2i6:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b16 v1, 6, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i12 %int to <2 x i6>
+ ret <2 x i6> %bitcast
+}
+
+define i160 @bitcast_v5f32_to_i160(<5 x float> %vec) {
+; GFX9-LABEL: bitcast_v5f32_to_i160:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v5f32_to_i160:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <5 x float> %vec to i160
+ ret i160 %bitcast
+}
+
+define <5 x float> @bitcast_i160_to_v5f32(i160 %int) {
+; GFX9-LABEL: bitcast_i160_to_v5f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i160_to_v5f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i160 %int to <5 x float>
+ ret <5 x float> %bitcast
+}
+
+define <6 x float> @bitcast_i192_to_v6f32(i192 %int) {
+; GFX9-LABEL: bitcast_i192_to_v6f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_i192_to_v6f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast i192 %int to <6 x float>
+ ret <6 x float> %bitcast
+}
+
+define i192 @bitcast_v6f32_to_i192(<6 x float> %vec) {
+; GFX9-LABEL: bitcast_v6f32_to_i192:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: bitcast_v6f32_to_i192:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %bitcast = bitcast <6 x float> %vec to i192
+ ret i192 %bitcast
+}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 7eaa52d..405058b 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -3091,15 +3091,6 @@ define i160 @load_i160(ptr addrspace(8) inreg %buf) {
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
; SDAG-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16
-; SDAG-NEXT: s_mov_b32 s4, s33
-; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
-; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
-; SDAG-NEXT: s_mov_b32 s5, s34
-; SDAG-NEXT: s_mov_b32 s34, s32
-; SDAG-NEXT: s_addk_i32 s32, 0x1800
-; SDAG-NEXT: s_mov_b32 s32, s34
-; SDAG-NEXT: s_mov_b32 s34, s5
-; SDAG-NEXT: s_mov_b32 s33, s4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -3119,17 +3110,8 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_i160:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_mov_b32 s4, s33
-; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
-; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
-; SDAG-NEXT: s_mov_b32 s5, s34
-; SDAG-NEXT: s_mov_b32 s34, s32
-; SDAG-NEXT: s_addk_i32 s32, 0x1000
; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; SDAG-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16
-; SDAG-NEXT: s_mov_b32 s32, s34
-; SDAG-NEXT: s_mov_b32 s34, s5
-; SDAG-NEXT: s_mov_b32 s33, s4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 17ab8fc..6bf126a 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -457,58 +457,27 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
;
; EG-LABEL: v_ctpop_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
+; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR * T0.W, T8.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR * T0.W, T8.Y, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: BCNT_INT T0.Y, PV.W,
+; EG-NEXT: AND_INT * T0.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T8.Y, T1.W, PV.W,
+; EG-NEXT: BCNT_INT T0.X, PV.W,
+; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T8.X, T4.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <4 x i16>, ptr addrspace(1) %in.gep, align 16
@@ -601,94 +570,33 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
;
; EG-LABEL: v_ctpop_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 73, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1
+; EG-NEXT: ALU 13, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: LSHR * T0.W, T12.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT * T0.W, PV.W,
-; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: LSHR * T0.W, T12.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.Y, PS, PV.W,
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T0.X, T8.X,
-; EG-NEXT: LSHR * T0.W, T12.Z, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV * T0.X, T9.X,
-; EG-NEXT: LSHR * T0.W, T12.W, literal.x,
+; EG-NEXT: BCNT_INT T0.Z, PS,
+; EG-NEXT: LSHR * T1.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
+; EG-NEXT: BCNT_INT T0.Y, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: LSHR T12.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
+; EG-NEXT: BCNT_INT T0.X, PV.W,
+; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T9.X, PV.W,
-; EG-NEXT: MOV * T0.X, T4.X,
-; EG-NEXT: MOV * T0.Z, T8.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <8 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <8 x i16>, ptr addrspace(1) %in.gep, align 32
@@ -837,174 +745,46 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add
;
; EG-LABEL: v_ctpop_v16i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @8
-; EG-NEXT: ALU 114, @16, KC0[], KC1[]
-; EG-NEXT: ALU 34, @131, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1
+; EG-NEXT: ALU 2, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 25, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T13.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T20.XYZW, T0.X, 16, #1
-; EG-NEXT: VTX_READ_128 T21.XYZW, T0.X, 0, #1
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 16, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: LSHR * T0.W, T20.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT * T0.W, PV.W,
-; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: LSHR * T0.W, T20.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.Y, PS, PV.W,
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T0.X, T8.X,
-; EG-NEXT: LSHR * T0.W, T20.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.Z, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV * T0.X, T9.X,
-; EG-NEXT: LSHR * T0.W, T20.W, literal.x,
+; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: LSHR * T0.W, T12.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: BCNT_INT T12.W, PV.W,
+; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: BCNT_INT T12.Z, PS,
+; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT: LSHR * T1.W, T12.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T20.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: BCNT_INT T12.Y, PS,
+; EG-NEXT: AND_INT T0.Z, T0.Z, literal.x,
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T9.X, PV.W,
-; EG-NEXT: MOV * T0.X, T12.X,
-; EG-NEXT: LSHR * T1.W, T21.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T1.W, T21.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV * T0.X, T13.X,
-; EG-NEXT: LSHR * T1.W, T21.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T1.W, T21.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T20.Y, PS, PV.W,
-; EG-NEXT: MOV T13.X, PV.Y,
-; EG-NEXT: MOV * T0.X, T16.X,
-; EG-NEXT: LSHR * T1.W, T21.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: ALU clause starting at 131:
-; EG-NEXT: MOV * T16.X, T1.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T1.W, T21.Z, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV * T0.X, T17.X,
-; EG-NEXT: LSHR * T1.W, T21.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T12.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: BCNT_INT T12.X, PS,
+; EG-NEXT: BCNT_INT T0.Z, PV.Z,
+; EG-NEXT: LSHR T1.W, T0.X, literal.x,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT T1.W, T21.W, literal.x,
-; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
-; EG-NEXT: AND_INT T0.Z, PV.X, literal.x,
-; EG-NEXT: BCNT_INT T1.W, PV.W,
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
-; EG-NEXT: -65536(nan), 16(2.242078e-44)
-; EG-NEXT: LSHR T22.X, PS, literal.x,
-; EG-NEXT: OR_INT * T20.W, PV.Z, PV.W,
+; EG-NEXT: LSHR T13.X, PS, literal.x,
+; EG-NEXT: BCNT_INT T0.Y, PV.W,
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT: BCNT_INT T0.X, PV.W,
+; EG-NEXT: LSHR * T14.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T17.X, PV.W,
-; EG-NEXT: MOV * T0.X, T4.X,
-; EG-NEXT: MOV * T0.Z, T8.X,
-; EG-NEXT: MOV T20.X, T12.X,
-; EG-NEXT: MOV * T20.Z, T16.X, BS:VEC_120/SCL_212
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <16 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <16 x i16>, ptr addrspace(1) %in.gep, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/dead_copy.mir b/llvm/test/CodeGen/AMDGPU/dead_copy.mir
index 2b54c61..5bc42e9 100644
--- a/llvm/test/CodeGen/AMDGPU/dead_copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/dead_copy.mir
@@ -1,4 +1,5 @@
# RUN: llc -o - %s -mtriple=amdgcn -mcpu=fiji -run-pass=machine-cp -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+# RUN: llc -o - %s -mtriple=amdgcn -mcpu=fiji -passes=machine-cp | FileCheck -check-prefix=GCN %s
# GCN-LABEL: dead_copy
# GCN: bb.0
diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
index e91bed4..b205678 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
@@ -66,6 +66,7 @@
; OSABI-PAL-ELF: amdpal.pipelines:
; OSABI-PAL-ELF: - .hardware_stages:
; OSABI-PAL-ELF: .cs:
+; OSABI-PAL-ELF: .entry_point: _amdgpu_cs
; OSABI-PAL-ELF: .entry_point_symbol: elf_notes
; OSABI-PAL-ELF: .scratch_memory_size: 0
; OSABI-PAL-ELF: .sgpr_count: 96
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 8704f4e..121891a 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1025,74 +1025,67 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16>
;
; EG-LABEL: v3i16_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @12, KC0[], KC1[]
-; EG-NEXT: TEX 2 @6
-; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
+; EG-NEXT: ALU 0, @10, KC0[], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: MOV * T5.X, 0.0,
-; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 44, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 48, #3
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T5.X, T2.W, PV.W,
-; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV * T5.Z, 0.0,
-; EG-NEXT: LSHR T8.X, T0.W, literal.x,
-; EG-NEXT: LSHL T0.W, T7.X, literal.y,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: LSHL T2.X, T2.W, PV.W,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT T6.X, PV.W, PS,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T2.Y, 0.0,
+; EG-NEXT: MOV * T2.Z, 0.0,
+; EG-NEXT: LSHR T0.X, T0.W, literal.x,
+; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v3i16_arg:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @12, KC0[], KC1[]
-; CM-NEXT: TEX 2 @6
-; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
+; CM-NEXT: TEX 0 @8
+; CM-NEXT: ALU 13, @13, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT MSKOR T1.XW, T2.X
+; CM-NEXT: ALU 1, @27, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @10
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
-; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3
-; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3
+; CM-NEXT: Fetch clause starting at 8:
+; CM-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3
+; CM-NEXT: Fetch clause starting at 10:
+; CM-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
; CM-NEXT: ALU clause starting at 12:
-; CM-NEXT: MOV * T5.X, 0.0,
+; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 13:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
+; CM-NEXT: AND_INT T0.Z, T1.X, literal.x,
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
-; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
-; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
+; CM-NEXT: LSHL T1.X, PV.Z, PV.W,
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: MOV T5.Y, 0.0,
-; CM-NEXT: MOV * T5.Z, 0.0,
-; CM-NEXT: LSHL T0.Z, T7.X, literal.x,
-; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
-; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T1.Y, 0.0,
+; CM-NEXT: MOV * T1.Z, 0.0,
+; CM-NEXT: LSHR * T2.X, T0.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: LSHR * T8.X, T0.W, literal.x,
+; CM-NEXT: ALU clause starting at 27:
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <3 x i16> %in, ptr addrspace(1) %out, align 4
@@ -2676,205 +2669,47 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
;
; EG-LABEL: v8i16_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 1, @36, KC0[], KC1[]
-; EG-NEXT: TEX 0 @20
-; EG-NEXT: ALU 5, @38, KC0[], KC1[]
-; EG-NEXT: TEX 0 @22
-; EG-NEXT: ALU 5, @44, KC0[], KC1[]
-; EG-NEXT: TEX 0 @24
-; EG-NEXT: ALU 5, @50, KC0[], KC1[]
-; EG-NEXT: TEX 0 @26
-; EG-NEXT: ALU 5, @56, KC0[], KC1[]
-; EG-NEXT: TEX 0 @28
-; EG-NEXT: ALU 5, @62, KC0[], KC1[]
-; EG-NEXT: TEX 0 @30
-; EG-NEXT: ALU 5, @68, KC0[], KC1[]
-; EG-NEXT: TEX 0 @32
-; EG-NEXT: ALU 5, @74, KC0[], KC1[]
-; EG-NEXT: TEX 0 @34
-; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
+; EG-NEXT: ALU 0, @14, KC0[], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 4, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 20:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
-; EG-NEXT: Fetch clause starting at 22:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
-; EG-NEXT: Fetch clause starting at 26:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
-; EG-NEXT: Fetch clause starting at 28:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
-; EG-NEXT: Fetch clause starting at 30:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
-; EG-NEXT: Fetch clause starting at 32:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
-; EG-NEXT: Fetch clause starting at 34:
-; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3
-; EG-NEXT: ALU clause starting at 36:
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: MOV * T7.X, 0.0,
-; EG-NEXT: ALU clause starting at 38:
-; EG-NEXT: LSHL T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 44:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 50:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 56:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 62:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 68:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T8.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 74:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T8.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.Z,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 80:
-; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
-; EG-NEXT: AND_INT * T1.W, T7.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), -65536(nan)
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T7.X, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.X,
-; EG-NEXT: MOV * T7.W, T3.X,
-; EG-NEXT: MOV * T7.Y, T5.X,
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 52, #3
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 54, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 62, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: MOV T1.Y, T2.X,
+; EG-NEXT: MOV * T1.Z, T0.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.W, T3.X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v8i16_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 1, @36, KC0[], KC1[]
-; CM-NEXT: TEX 0 @20
-; CM-NEXT: ALU 5, @38, KC0[], KC1[]
-; CM-NEXT: TEX 0 @22
-; CM-NEXT: ALU 5, @44, KC0[], KC1[]
-; CM-NEXT: TEX 0 @24
-; CM-NEXT: ALU 5, @50, KC0[], KC1[]
-; CM-NEXT: TEX 0 @26
-; CM-NEXT: ALU 5, @56, KC0[], KC1[]
-; CM-NEXT: TEX 0 @28
-; CM-NEXT: ALU 5, @62, KC0[], KC1[]
-; CM-NEXT: TEX 0 @30
-; CM-NEXT: ALU 5, @68, KC0[], KC1[]
-; CM-NEXT: TEX 0 @32
-; CM-NEXT: ALU 5, @74, KC0[], KC1[]
-; CM-NEXT: TEX 0 @34
-; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
+; CM-NEXT: ALU 0, @14, KC0[], KC1[]
+; CM-NEXT: TEX 3 @6
+; CM-NEXT: ALU 4, @15, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
-; CM-NEXT: Fetch clause starting at 20:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
-; CM-NEXT: Fetch clause starting at 22:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
-; CM-NEXT: Fetch clause starting at 24:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
-; CM-NEXT: Fetch clause starting at 26:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
-; CM-NEXT: Fetch clause starting at 28:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
-; CM-NEXT: Fetch clause starting at 30:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
-; CM-NEXT: Fetch clause starting at 32:
-; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
-; CM-NEXT: Fetch clause starting at 34:
-; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3
-; CM-NEXT: ALU clause starting at 36:
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: MOV * T7.X, 0.0,
-; CM-NEXT: ALU clause starting at 38:
-; CM-NEXT: LSHL T0.Z, T8.X, literal.x,
-; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
-; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 44:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 50:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 56:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 62:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 68:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T8.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 74:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T8.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.Z,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 80:
-; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: AND_INT * T0.W, T7.X, literal.z,
-; CM-NEXT: 2(2.802597e-45), -65536(nan)
-; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.X,
-; CM-NEXT: MOV * T7.W, T3.X,
-; CM-NEXT: MOV * T7.Y, T5.X,
+; CM-NEXT: Fetch clause starting at 6:
+; CM-NEXT: VTX_READ_16 T1.X, T0.X, 52, #3
+; CM-NEXT: VTX_READ_16 T2.X, T0.X, 54, #3
+; CM-NEXT: VTX_READ_16 T3.X, T0.X, 62, #3
+; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3
+; CM-NEXT: ALU clause starting at 14:
+; CM-NEXT: MOV * T0.X, 0.0,
+; CM-NEXT: ALU clause starting at 15:
+; CM-NEXT: MOV T1.Y, T2.X,
+; CM-NEXT: MOV * T1.Z, T0.X, BS:VEC_120/SCL_212
+; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV * T1.W, T3.X,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
store <8 x i16> %in, ptr addrspace(1) %out
ret void
@@ -3618,392 +3453,68 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
;
; EG-LABEL: v16i16_arg:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 1, @68, KC0[], KC1[]
-; EG-NEXT: TEX 0 @36
-; EG-NEXT: ALU 5, @70, KC0[], KC1[]
-; EG-NEXT: TEX 0 @38
-; EG-NEXT: ALU 5, @76, KC0[], KC1[]
-; EG-NEXT: TEX 0 @40
-; EG-NEXT: ALU 5, @82, KC0[], KC1[]
-; EG-NEXT: TEX 0 @42
-; EG-NEXT: ALU 5, @88, KC0[], KC1[]
-; EG-NEXT: TEX 0 @44
-; EG-NEXT: ALU 5, @94, KC0[], KC1[]
-; EG-NEXT: TEX 0 @46
-; EG-NEXT: ALU 5, @100, KC0[], KC1[]
-; EG-NEXT: TEX 0 @48
-; EG-NEXT: ALU 5, @106, KC0[], KC1[]
-; EG-NEXT: TEX 0 @50
-; EG-NEXT: ALU 5, @112, KC0[], KC1[]
-; EG-NEXT: TEX 0 @52
-; EG-NEXT: ALU 5, @118, KC0[], KC1[]
-; EG-NEXT: TEX 0 @54
-; EG-NEXT: ALU 5, @124, KC0[], KC1[]
-; EG-NEXT: TEX 0 @56
-; EG-NEXT: ALU 5, @130, KC0[], KC1[]
-; EG-NEXT: TEX 0 @58
-; EG-NEXT: ALU 5, @136, KC0[], KC1[]
-; EG-NEXT: TEX 0 @60
-; EG-NEXT: ALU 5, @142, KC0[], KC1[]
-; EG-NEXT: TEX 0 @62
-; EG-NEXT: ALU 5, @148, KC0[], KC1[]
-; EG-NEXT: TEX 0 @64
-; EG-NEXT: ALU 5, @154, KC0[], KC1[]
-; EG-NEXT: TEX 0 @66
-; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
+; EG-NEXT: ALU 0, @22, KC0[], KC1[]
+; EG-NEXT: TEX 7 @6
+; EG-NEXT: ALU 10, @23, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 36:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3
-; EG-NEXT: Fetch clause starting at 38:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3
-; EG-NEXT: Fetch clause starting at 40:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3
-; EG-NEXT: Fetch clause starting at 42:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3
-; EG-NEXT: Fetch clause starting at 44:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3
-; EG-NEXT: Fetch clause starting at 46:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3
-; EG-NEXT: Fetch clause starting at 48:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3
-; EG-NEXT: Fetch clause starting at 50:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3
-; EG-NEXT: Fetch clause starting at 52:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3
-; EG-NEXT: Fetch clause starting at 54:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3
-; EG-NEXT: Fetch clause starting at 56:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3
-; EG-NEXT: Fetch clause starting at 58:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3
-; EG-NEXT: Fetch clause starting at 60:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3
-; EG-NEXT: Fetch clause starting at 62:
-; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3
-; EG-NEXT: Fetch clause starting at 64:
-; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3
-; EG-NEXT: Fetch clause starting at 66:
-; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3
-; EG-NEXT: ALU clause starting at 68:
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: MOV * T11.X, 0.0,
-; EG-NEXT: ALU clause starting at 70:
-; EG-NEXT: LSHL T0.W, T12.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 76:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T7.X,
-; EG-NEXT: ALU clause starting at 82:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T7.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T9.X,
-; EG-NEXT: ALU clause starting at 88:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: ALU clause starting at 94:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 100:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T7.X,
-; EG-NEXT: ALU clause starting at 106:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T7.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T9.X,
-; EG-NEXT: ALU clause starting at 112:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 118:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 124:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T6.X,
-; EG-NEXT: ALU clause starting at 130:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T6.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: ALU clause starting at 136:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T1.W, T12.X, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 142:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T12.Z, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.Z,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 148:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T12.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T12.X, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.X,
-; EG-NEXT: MOV * T0.Y, T6.X,
-; EG-NEXT: ALU clause starting at 154:
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, T13.X, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T11.Z, PV.W, PS,
-; EG-NEXT: MOV T6.X, PV.Z,
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: ALU clause starting at 160:
-; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 84, #3
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 86, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 94, #3
+; EG-NEXT: VTX_READ_16 T4.X, T0.X, 78, #3
+; EG-NEXT: VTX_READ_16 T5.X, T0.X, 76, #3
+; EG-NEXT: VTX_READ_16 T6.X, T0.X, 92, #3
+; EG-NEXT: VTX_READ_16 T7.X, T0.X, 68, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 70, #3
+; EG-NEXT: ALU clause starting at 22:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 23:
+; EG-NEXT: MOV T1.Y, T2.X,
+; EG-NEXT: MOV * T7.Y, T0.X,
+; EG-NEXT: MOV * T1.Z, T6.X,
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T7.Z, T5.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: LSHR T14.X, PV.W, literal.x,
-; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
-; EG-NEXT: AND_INT * T1.W, T11.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), -65536(nan)
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T11.X, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.X,
-; EG-NEXT: MOV * T12.W, T3.X,
-; EG-NEXT: MOV T12.Y, T5.X,
-; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212
-; EG-NEXT: MOV * T11.Y, T9.X,
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
+; EG-NEXT: MOV T7.W, T4.X,
+; EG-NEXT: MOV * T1.W, T3.X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: v16i16_arg:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 1, @68, KC0[], KC1[]
-; CM-NEXT: TEX 0 @36
-; CM-NEXT: ALU 5, @70, KC0[], KC1[]
-; CM-NEXT: TEX 0 @38
-; CM-NEXT: ALU 5, @76, KC0[], KC1[]
-; CM-NEXT: TEX 0 @40
-; CM-NEXT: ALU 5, @82, KC0[], KC1[]
-; CM-NEXT: TEX 0 @42
-; CM-NEXT: ALU 5, @88, KC0[], KC1[]
-; CM-NEXT: TEX 0 @44
-; CM-NEXT: ALU 5, @94, KC0[], KC1[]
-; CM-NEXT: TEX 0 @46
-; CM-NEXT: ALU 5, @100, KC0[], KC1[]
-; CM-NEXT: TEX 0 @48
-; CM-NEXT: ALU 5, @106, KC0[], KC1[]
-; CM-NEXT: TEX 0 @50
-; CM-NEXT: ALU 5, @112, KC0[], KC1[]
-; CM-NEXT: TEX 0 @52
-; CM-NEXT: ALU 5, @118, KC0[], KC1[]
-; CM-NEXT: TEX 0 @54
-; CM-NEXT: ALU 5, @124, KC0[], KC1[]
-; CM-NEXT: TEX 0 @56
-; CM-NEXT: ALU 5, @130, KC0[], KC1[]
-; CM-NEXT: TEX 0 @58
-; CM-NEXT: ALU 5, @136, KC0[], KC1[]
-; CM-NEXT: TEX 0 @60
-; CM-NEXT: ALU 5, @142, KC0[], KC1[]
-; CM-NEXT: TEX 0 @62
-; CM-NEXT: ALU 5, @148, KC0[], KC1[]
-; CM-NEXT: TEX 0 @64
-; CM-NEXT: ALU 5, @154, KC0[], KC1[]
-; CM-NEXT: TEX 0 @66
-; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
+; CM-NEXT: ALU 0, @22, KC0[], KC1[]
+; CM-NEXT: TEX 7 @6
+; CM-NEXT: ALU 11, @23, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T2.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
-; CM-NEXT: Fetch clause starting at 36:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3
-; CM-NEXT: Fetch clause starting at 38:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3
-; CM-NEXT: Fetch clause starting at 40:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3
-; CM-NEXT: Fetch clause starting at 42:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3
-; CM-NEXT: Fetch clause starting at 44:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3
-; CM-NEXT: Fetch clause starting at 46:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3
-; CM-NEXT: Fetch clause starting at 48:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3
-; CM-NEXT: Fetch clause starting at 50:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3
-; CM-NEXT: Fetch clause starting at 52:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3
-; CM-NEXT: Fetch clause starting at 54:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3
-; CM-NEXT: Fetch clause starting at 56:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3
-; CM-NEXT: Fetch clause starting at 58:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3
-; CM-NEXT: Fetch clause starting at 60:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3
-; CM-NEXT: Fetch clause starting at 62:
-; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3
-; CM-NEXT: Fetch clause starting at 64:
-; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3
-; CM-NEXT: Fetch clause starting at 66:
-; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3
-; CM-NEXT: ALU clause starting at 68:
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: MOV * T11.X, 0.0,
-; CM-NEXT: ALU clause starting at 70:
-; CM-NEXT: LSHL T0.Z, T12.X, literal.x,
-; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
-; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 76:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T7.X,
-; CM-NEXT: ALU clause starting at 82:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T7.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T9.X,
-; CM-NEXT: ALU clause starting at 88:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T9.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T3.X,
-; CM-NEXT: ALU clause starting at 94:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T3.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T5.X,
-; CM-NEXT: ALU clause starting at 100:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T5.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T7.X,
-; CM-NEXT: ALU clause starting at 106:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T7.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T9.X,
-; CM-NEXT: ALU clause starting at 112:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T9.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 118:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 124:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T6.X,
-; CM-NEXT: ALU clause starting at 130:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T6.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T8.X,
-; CM-NEXT: ALU clause starting at 136:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: LSHL * T0.W, T12.X, literal.y,
-; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
-; CM-NEXT: MOV T8.X, PV.W,
-; CM-NEXT: MOV * T0.Y, T2.X,
-; CM-NEXT: ALU clause starting at 142:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W,
-; CM-NEXT: MOV T2.X, PV.Z,
-; CM-NEXT: MOV * T0.Y, T4.X,
-; CM-NEXT: ALU clause starting at 148:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T12.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W,
-; CM-NEXT: MOV T4.X, PV.X,
-; CM-NEXT: MOV * T0.Y, T6.X,
-; CM-NEXT: ALU clause starting at 154:
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
-; CM-NEXT: AND_INT * T0.W, T13.X, literal.y,
-; CM-NEXT: -65536(nan), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W,
-; CM-NEXT: MOV T6.X, PV.Z,
-; CM-NEXT: MOV * T0.Y, T8.X,
-; CM-NEXT: ALU clause starting at 160:
+; CM-NEXT: Fetch clause starting at 6:
+; CM-NEXT: VTX_READ_16 T1.X, T0.X, 84, #3
+; CM-NEXT: VTX_READ_16 T2.X, T0.X, 86, #3
+; CM-NEXT: VTX_READ_16 T3.X, T0.X, 78, #3
+; CM-NEXT: VTX_READ_16 T4.X, T0.X, 94, #3
+; CM-NEXT: VTX_READ_16 T5.X, T0.X, 76, #3
+; CM-NEXT: VTX_READ_16 T6.X, T0.X, 92, #3
+; CM-NEXT: VTX_READ_16 T7.X, T0.X, 68, #3
+; CM-NEXT: VTX_READ_16 T0.X, T0.X, 70, #3
+; CM-NEXT: ALU clause starting at 22:
+; CM-NEXT: MOV * T0.X, 0.0,
+; CM-NEXT: ALU clause starting at 23:
+; CM-NEXT: MOV * T1.Y, T2.X,
+; CM-NEXT: MOV T7.Y, T0.X,
+; CM-NEXT: MOV T1.Z, T6.X, BS:VEC_120/SCL_212
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR * T13.X, PV.W, literal.x,
+; CM-NEXT: LSHR T0.X, PV.W, literal.x,
+; CM-NEXT: MOV T7.Z, T5.X,
+; CM-NEXT: MOV * T1.W, T4.X, BS:VEC_120/SCL_212
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV * T7.W, T3.X,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x,
-; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: AND_INT * T0.W, T11.X, literal.z,
-; CM-NEXT: 2(2.802597e-45), -65536(nan)
-; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W,
-; CM-NEXT: MOV T8.X, PV.X,
-; CM-NEXT: MOV * T12.W, T3.X,
-; CM-NEXT: MOV T12.Y, T5.X,
-; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212
-; CM-NEXT: MOV * T11.Y, T9.X,
entry:
store <16 x i16> %in, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 2afac4e..458afa4 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -212,38 +212,32 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
;
; EG-LABEL: constant_load_v3i16:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 2 @6
-; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T5.X, T2.W, PV.W,
-; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV * T5.Z, 0.0,
-; EG-NEXT: LSHR T8.X, T0.W, literal.x,
-; EG-NEXT: LSHL T0.W, T7.X, literal.y,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: LSHL T2.X, T2.W, PV.W,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT T6.X, PV.W, PS,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T2.Y, 0.0,
+; EG-NEXT: MOV * T2.Z, 0.0,
+; EG-NEXT: LSHR T0.X, T0.W, literal.x,
+; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX12-LABEL: constant_load_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index b945c7c..c608bef3 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -9491,50 +9491,24 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_zextload_v4i8_to_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1
+; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
+; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: AND_INT T0.W, T7.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T7.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
+; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T0.W, T7.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT: BFE_UINT * T4.Y, T4.X, literal.x, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T8.Y, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T8.X, T4.X,
+; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
@@ -9633,56 +9607,23 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_sextload_v4i8_to_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1
+; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
+; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: MOV * T7.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
+; EG-NEXT: MOV * T4.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
+; EG-NEXT: LSHR T0.W, T4.X, literal.x,
+; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T7.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T8.Y, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T8.X, T4.X,
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
@@ -9800,80 +9741,27 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_zextload_v8i8_to_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 61, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1
+; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
+; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: AND_INT T0.W, T11.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T11.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
+; EG-NEXT: MOV * T5.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T11.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T12.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T11.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T11.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T0.W, T11.Y, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
+; EG-NEXT: BFE_UINT * T6.W, T5.Y, literal.x, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T12.W, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T12.X, T8.X,
-; EG-NEXT: MOV * T12.Z, T4.X,
+; EG-NEXT: BFE_UINT T6.Y, T5.X, literal.x, T0.W,
+; EG-NEXT: AND_INT * T6.Z, T5.Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
+; EG-NEXT: AND_INT T6.X, T5.X, literal.x,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
@@ -10017,93 +9905,28 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; EG-LABEL: constant_sextload_v8i8_to_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1
+; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
+; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: MOV * T11.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
+; EG-NEXT: MOV * T5.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T11.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T12.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T5.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
+; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T5.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T11.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T12.W, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T12.X, T8.X,
-; EG-NEXT: MOV * T12.Z, T4.X,
+; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
@@ -10296,146 +10119,37 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; EG-LABEL: constant_zextload_v16i8_to_v16i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 103, @12, KC0[], KC1[]
-; EG-NEXT: ALU 20, @116, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT T0.W, T19.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T19.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T19.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T12.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.Y, literal.x,
+; EG-NEXT: BFE_UINT * T8.W, T7.Y, literal.x, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T20.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.Z, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T19.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T19.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T19.W, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T19.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 116:
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR T0.W, T19.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T21.X, PS, literal.x,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.y,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: 16711680(2.341805e-38), 0(0.000000e+00)
-; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T19.W, PV.W, PS,
+; EG-NEXT: BFE_UINT T8.Y, T7.X, literal.x, T0.W,
+; EG-NEXT: AND_INT T8.Z, T7.Y, literal.y,
+; EG-NEXT: BFE_UINT * T9.W, T7.W, literal.x, T0.W,
+; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
+; EG-NEXT: AND_INT T8.X, T7.X, literal.x,
+; EG-NEXT: BFE_UINT T9.Y, T7.Z, literal.y, T0.W,
+; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT * T9.Z, T7.W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T9.X, T7.Z, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; EG-NEXT: LSHR * T10.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T20.X, T16.X,
-; EG-NEXT: MOV * T20.Z, T12.X,
-; EG-NEXT: MOV T19.X, T8.X,
-; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
@@ -10683,173 +10397,38 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; EG-LABEL: constant_sextload_v16i8_to_v16i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 0 @8
-; EG-NEXT: ALU 104, @12, KC0[], KC1[]
-; EG-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T19.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
-; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T19.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T12.X,
-; EG-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T19.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T20.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, T8.X,
-; EG-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x,
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT * T8.Z, T7.Y, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
+; EG-NEXT: BFE_INT T8.X, T7.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T9.Z, T7.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T7.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T19.Z, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: ALU clause starting at 117:
-; EG-NEXT: OR_INT * T19.Y, T1.W, T0.W,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
+; EG-NEXT: BFE_INT T9.X, T7.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Z, T7.W, literal.x,
+; EG-NEXT: BFE_INT T8.W, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR T0.W, T19.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
-; EG-NEXT: LSHR T21.X, PS, literal.x,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.y,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
+; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: BFE_INT T8.Y, PS, 0.0, literal.y,
+; EG-NEXT: LSHR T1.Z, T7.Z, literal.y,
+; EG-NEXT: BFE_INT T9.W, PV.Z, 0.0, literal.y,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T19.W, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T20.X, T16.X,
-; EG-NEXT: MOV * T20.Z, T12.X,
-; EG-NEXT: MOV T19.X, T8.X,
-; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T10.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
@@ -11194,276 +10773,58 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
;
; EG-LABEL: constant_zextload_v32i8_to_v32i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @10
-; EG-NEXT: ALU 103, @16, KC0[], KC1[]
-; EG-NEXT: ALU 104, @120, KC0[], KC1[]
-; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @8
+; EG-NEXT: ALU 37, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 10:
-; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
-; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
-; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T35.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: AND_INT T0.W, T37.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 255(3.573311e-43), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T0.W, T37.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: MOV * T11.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 13:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.X, literal.x,
+; EG-NEXT: BFE_UINT * T13.W, T11.Y, literal.x, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T36.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T12.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T36.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T8.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T37.Y, PV.W, PS,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T37.W, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T37.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 120:
-; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T37.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T37.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T32.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.X, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, T33.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.X, literal.x, T0.W, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T33.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T38.Y, PV.W, PS,
-; EG-NEXT: MOV T33.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T28.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.Y, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, T29.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.Y, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T29.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T38.W, PV.W, PS,
-; EG-NEXT: MOV T29.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T24.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.Z, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHL * T1.W, T35.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, T25.X,
-; EG-NEXT: BFE_UINT * T1.W, T35.Z, literal.x, T0.W,
+; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W,
+; EG-NEXT: AND_INT T13.Z, T11.Y, literal.y,
+; EG-NEXT: BFE_UINT * T14.W, T11.W, literal.x, T0.W,
+; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
+; EG-NEXT: AND_INT T13.X, T11.X, literal.x,
+; EG-NEXT: BFE_UINT T14.Y, T11.Z, literal.y, T0.W,
+; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T14.Z, T11.W, literal.x,
+; EG-NEXT: BFE_UINT * T15.W, T12.Y, literal.y, T0.W,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: AND_INT T14.X, T11.Z, literal.x,
+; EG-NEXT: BFE_UINT T15.Y, T12.X, literal.y, T0.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
-; EG-NEXT: MOV * T25.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T1.W, T35.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T35.Y, PV.W, PS,
-; EG-NEXT: MOV T25.X, PV.Y,
-; EG-NEXT: MOV * T0.Y, T20.X,
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T2.W, T35.W, literal.y,
-; EG-NEXT: -65536(nan), 255(3.573311e-43)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV * T20.X, PV.W,
-; EG-NEXT: ALU clause starting at 225:
-; EG-NEXT: MOV T0.Y, T20.X,
-; EG-NEXT: LSHL * T1.W, T35.W, literal.x,
+; EG-NEXT: LSHR T16.X, PV.W, literal.x,
+; EG-NEXT: AND_INT T15.Z, T12.Y, literal.y,
+; EG-NEXT: BFE_UINT T17.W, T12.W, literal.z, T0.W,
+; EG-NEXT: AND_INT * T15.X, T12.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 255(3.573311e-43)
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
-; EG-NEXT: OR_INT * T1.W, PV.W, PS,
-; EG-NEXT: MOV T20.X, PV.W,
-; EG-NEXT: MOV T0.Y, T21.X,
-; EG-NEXT: BFE_UINT * T0.W, T35.W, literal.x, T0.W,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
-; EG-NEXT: MOV * T21.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
+; EG-NEXT: BFE_UINT T17.Y, T12.Z, literal.x, T0.W,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
+; EG-NEXT: LSHR T12.X, PV.W, literal.x,
+; EG-NEXT: AND_INT T17.Z, T12.W, literal.y,
+; EG-NEXT: AND_INT * T17.X, T12.Z, literal.y,
+; EG-NEXT: 2(2.802597e-45), 255(3.573311e-43)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T39.X, PV.W, literal.x,
-; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.W, T35.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T41.X, PS, literal.x,
-; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; EG-NEXT: AND_INT T0.W, PV.W, literal.z,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: 16711680(2.341805e-38), 32(4.484155e-44)
-; EG-NEXT: LSHR T42.X, PS, literal.x,
-; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W,
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR * T18.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T21.X, PV.W,
-; EG-NEXT: MOV * T36.X, T16.X,
-; EG-NEXT: MOV * T36.Z, T12.X,
-; EG-NEXT: MOV T37.X, T8.X,
-; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212
-; EG-NEXT: MOV * T38.X, T32.X,
-; EG-NEXT: MOV * T38.Z, T28.X,
-; EG-NEXT: MOV T35.X, T24.X,
-; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
;
; GFX12-LABEL: constant_zextload_v32i8_to_v32i16:
; GFX12: ; %bb.0:
@@ -11919,331 +11280,60 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
;
; EG-LABEL: constant_sextload_v32i8_to_v32i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @10
-; EG-NEXT: ALU 104, @16, KC0[], KC1[]
-; EG-NEXT: ALU 104, @121, KC0[], KC1[]
-; EG-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
+; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @8
+; EG-NEXT: ALU 39, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T13.X, 1
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 10:
-; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
-; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
-; EG-NEXT: ALU clause starting at 14:
-; EG-NEXT: MOV * T0.Y, T16.X,
-; EG-NEXT: MOV * T35.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 16:
-; EG-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T16.X, PV.W,
-; EG-NEXT: MOV T0.Y, T17.X,
-; EG-NEXT: LSHR * T0.W, T37.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T17.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T36.Y, PV.W, PS,
-; EG-NEXT: MOV T17.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T12.X,
-; EG-NEXT: BFE_INT * T0.W, T37.Y, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T12.X, PV.W,
-; EG-NEXT: MOV T0.Y, T13.X,
-; EG-NEXT: LSHR * T0.W, T37.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T36.W, PV.W, PS,
-; EG-NEXT: MOV T13.X, PV.W,
-; EG-NEXT: MOV T0.Y, T8.X,
-; EG-NEXT: BFE_INT * T0.W, T37.Z, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.Z, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV T0.Y, T9.X,
-; EG-NEXT: LSHR * T0.W, T37.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.Z, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: ALU clause starting at 121:
-; EG-NEXT: OR_INT * T37.Y, T1.W, T0.W,
-; EG-NEXT: MOV T9.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T37.W, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV T0.Y, T5.X,
-; EG-NEXT: LSHR * T0.W, T37.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T37.W, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T37.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV T0.Y, T32.X,
-; EG-NEXT: BFE_INT * T0.W, T35.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.X, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T32.X, PV.W,
-; EG-NEXT: MOV T0.Y, T33.X,
-; EG-NEXT: LSHR * T0.W, T35.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T33.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T35.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T38.Y, PV.W, PS,
-; EG-NEXT: MOV T33.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T28.X,
-; EG-NEXT: BFE_INT * T0.W, T35.Y, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.Y, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T28.X, PV.W,
-; EG-NEXT: MOV T0.Y, T29.X,
-; EG-NEXT: LSHR * T0.W, T35.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T29.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T35.Y, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 226:
-; EG-NEXT: AND_INT T1.W, T0.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, T0.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T38.W, PV.W, PS,
-; EG-NEXT: MOV T29.X, PV.W,
-; EG-NEXT: MOV T0.Y, T24.X,
-; EG-NEXT: BFE_INT * T0.W, T35.Z, 0.0, literal.x,
-; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.Z, literal.x,
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
+; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: MOV * T11.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: LSHR T14.X, PV.W, literal.x,
+; EG-NEXT: BFE_INT * T15.Z, T11.Y, 0.0, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: BFE_INT T15.X, T11.X, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Y, T12.W, literal.x,
+; EG-NEXT: BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR T0.W, T12.Y, literal.x,
+; EG-NEXT: LSHR * T1.W, T11.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T24.X, PV.W,
-; EG-NEXT: MOV T0.Y, T25.X,
-; EG-NEXT: LSHR * T0.W, T35.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T25.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ASHR * T0.W, T35.Z, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: OR_INT * T35.Y, PV.W, PS,
-; EG-NEXT: MOV T25.X, PV.Y,
-; EG-NEXT: MOV T0.Y, T20.X,
-; EG-NEXT: BFE_INT * T0.W, T35.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.X, T11.Z, 0.0, literal.x,
+; EG-NEXT: LSHR T1.Y, T11.W, literal.x,
+; EG-NEXT: BFE_INT T17.Z, T12.Y, 0.0, literal.x,
+; EG-NEXT: BFE_INT T15.W, PS, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T11.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
-; EG-NEXT: -65536(nan), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV * T20.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T35.W, literal.x,
+; EG-NEXT: BFE_INT T17.X, T12.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT T15.Y, PS, 0.0, literal.x,
+; EG-NEXT: BFE_INT T18.Z, T12.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.W, PV.Y, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T11.Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T20.X, PV.W,
-; EG-NEXT: MOV T0.Y, T21.X,
-; EG-NEXT: LSHR * T0.W, T35.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
-; EG-NEXT: 8(1.121039e-44), -65536(nan)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T21.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T39.X, PV.W, literal.x,
-; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: ASHR T0.W, T35.W, literal.x,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: 24(3.363116e-44), 48(6.726233e-44)
-; EG-NEXT: LSHR T41.X, PS, literal.x,
-; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y,
-; EG-NEXT: LSHL T0.W, PV.W, literal.z,
-; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
-; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
-; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
-; EG-NEXT: LSHR T42.X, PS, literal.x,
-; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T21.X, PV.W,
-; EG-NEXT: MOV * T36.X, T16.X,
-; EG-NEXT: MOV * T36.Z, T12.X,
-; EG-NEXT: MOV T37.X, T8.X,
-; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212
-; EG-NEXT: MOV * T38.X, T32.X,
-; EG-NEXT: MOV * T38.Z, T28.X,
-; EG-NEXT: MOV T35.X, T24.X,
-; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
+; EG-NEXT: BFE_INT T18.X, T12.Z, 0.0, literal.x,
+; EG-NEXT: BFE_INT T16.Y, PS, 0.0, literal.x,
+; EG-NEXT: LSHR T0.Z, T12.X, literal.x,
+; EG-NEXT: BFE_INT T17.W, T0.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
+; EG-NEXT: LSHR T11.X, PS, literal.x,
+; EG-NEXT: BFE_INT T17.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: LSHR T0.Z, T12.Z, literal.y,
+; EG-NEXT: BFE_INT T18.W, T0.Y, 0.0, literal.y,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T12.X, PS, literal.x,
+; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
;
; GFX12-LABEL: constant_sextload_v32i8_to_v32i16:
; GFX12: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 8589158..5733382 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -254,74 +254,63 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
;
; EG-LABEL: global_load_v3i16:
; EG: ; %bb.0: ; %entry
-; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 2 @6
-; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
-; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0
+; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: MOV * T5.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T5.X, T2.W, PV.W,
-; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV T5.Y, 0.0,
-; EG-NEXT: MOV * T5.Z, 0.0,
-; EG-NEXT: LSHR T8.X, T0.W, literal.x,
-; EG-NEXT: LSHL T0.W, T7.X, literal.y,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: LSHL T2.X, T2.W, PV.W,
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT T6.X, PV.W, PS,
-; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T2.Y, 0.0,
+; EG-NEXT: MOV * T2.Z, 0.0,
+; EG-NEXT: LSHR T0.X, T0.W, literal.x,
+; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_load_v3i16:
; CM: ; %bb.0: ; %entry
-; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
-; CM-NEXT: TEX 2 @6
-; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
+; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 1 @6
+; CM-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
-; CM-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1
-; CM-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1
-; CM-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1
-; CM-NEXT: ALU clause starting at 12:
-; CM-NEXT: MOV * T5.X, KC0[2].Z,
-; CM-NEXT: ALU clause starting at 13:
+; CM-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1
+; CM-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; CM-NEXT: ALU clause starting at 10:
+; CM-NEXT: MOV * T0.X, KC0[2].Z,
+; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
+; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
-; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
-; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
+; CM-NEXT: LSHL T2.X, PV.Z, PV.W,
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; CM-NEXT: MOV T5.Y, 0.0,
-; CM-NEXT: MOV * T5.Z, 0.0,
-; CM-NEXT: LSHL T0.Z, T7.X, literal.x,
-; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
-; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
-; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T2.Y, 0.0,
+; CM-NEXT: MOV * T2.Z, 0.0,
+; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: LSHR * T8.X, T0.W, literal.x,
+; CM-NEXT: LSHR * T3.X, T0.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load <3 x i16>, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index fb34b5e..896e609 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -916,38 +916,22 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
+; EG: BFE_{{U?}}INT
define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%load = load <32 x i8>, ptr addrspace(1) %in
%ext = sext <32 x i8> %load to <32 x i16>
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
index e8744c7..2b10d46 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
@@ -6,76 +6,37 @@
define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX12-LABEL: buffer_last_use_load_0:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX12-NEXT: s_mov_b32 s12, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s12
+; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-NEXT: scratch_store_b128 off, v[7:10], off
+; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX12-NEXT: v_mov_b32_e32 v7, s6
-; GFX12-NEXT: v_mov_b32_e32 v9, s0
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v4
-; GFX12-NEXT: v_readfirstlane_b32 s5, v5
-; GFX12-NEXT: v_readfirstlane_b32 s6, v6
-; GFX12-NEXT: v_readfirstlane_b32 s7, v7
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX12-NEXT: ; implicit-def: $vgpr9
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s8
-; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
+; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-NEXT: ; implicit-def: $vgpr8
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_3
-; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_endpgm
entry:
%val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{}
@@ -86,77 +47,38 @@ entry:
define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX12-LABEL: buffer_last_use_load_1:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_mov_b32 s12, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s12
+; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
-; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
-; GFX12-NEXT: scratch_store_b128 off, v[1:4], off offset:32
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[6:7], off, off offset:40
-; GFX12-NEXT: scratch_load_b32 v5, off, off offset:36
-; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-NEXT: scratch_store_b128 off, v[8:11], off
+; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[2:3], off, off offset:8
-; GFX12-NEXT: scratch_load_b32 v1, off, off offset:4
-; GFX12-NEXT: v_mov_b32_e32 v8, s6
-; GFX12-NEXT: v_lshl_add_u32 v9, v0, 2, s0
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, s1
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v5
-; GFX12-NEXT: v_readfirstlane_b32 s5, v6
-; GFX12-NEXT: v_readfirstlane_b32 s6, v7
-; GFX12-NEXT: v_readfirstlane_b32 s7, v8
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[5:6]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v0, v9, s[4:7], null offen th:TH_LOAD_LU
-; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8
-; GFX12-NEXT: ; implicit-def: $vgpr9
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v5, s8
-; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
+; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v0, v5, s[4:7], null offen
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr0
-; GFX12-NEXT: ; implicit-def: $vgpr5
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB1_3
-; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -169,76 +91,37 @@ entry:
define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX12-LABEL: buffer_last_use_and_volatile_load:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX12-NEXT: s_mov_b32 s12, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s12
+; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-NEXT: scratch_store_b128 off, v[7:10], off
+; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX12-NEXT: v_mov_b32_e32 v7, s6
-; GFX12-NEXT: v_mov_b32_e32 v9, s0
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v4
-; GFX12-NEXT: v_readfirstlane_b32 s5, v5
-; GFX12-NEXT: v_readfirstlane_b32 s6, v6
-; GFX12-NEXT: v_readfirstlane_b32 s7, v7
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX12-NEXT: ; implicit-def: $vgpr9
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s8
-; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
+; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-NEXT: ; implicit-def: $vgpr8
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_3
-; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr addrspace(7) %in, !amdgpu.last.use !{}
@@ -249,76 +132,37 @@ entry:
define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX12-LABEL: buffer_last_use_and_nontemporal_load:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX12-NEXT: s_mov_b32 s12, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s12
+; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX12-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX12-NEXT: scratch_store_b128 off, v[0:3], off offset:32
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX12-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-NEXT: scratch_store_b128 off, v[7:10], off
+; GFX12-NEXT: s_mov_b32 s6, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX12-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX12-NEXT: v_mov_b32_e32 v7, s6
-; GFX12-NEXT: v_mov_b32_e32 v9, s0
+; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v4
-; GFX12-NEXT: v_readfirstlane_b32 s5, v5
-; GFX12-NEXT: v_readfirstlane_b32 s6, v6
-; GFX12-NEXT: v_readfirstlane_b32 s7, v7
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_LU
-; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX12-NEXT: ; implicit-def: $vgpr9
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v4, s8
-; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: .LBB3_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
+; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-NEXT: s_mov_b32 s13, s2
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-NEXT: ; implicit-def: $vgpr8
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB3_3
-; GFX12-NEXT: ; %bb.4:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_endpgm
entry:
%val = load i32, ptr addrspace(7) %in, !amdgpu.last.use !{}, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
index a5f6c2f..a62910e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll
@@ -13,30 +13,32 @@
define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX9-SDAG-LABEL: buffer_nontemporal_load_store:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10
; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
-; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s15
-; GFX9-SDAG-NEXT: s_mov_b32 s15, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s10
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s14, s7
-; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-SDAG-NEXT: s_mov_b32 s12, s5
-; GFX9-SDAG-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11]
-; GFX9-SDAG-NEXT: s_mov_b32 s13, s6
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s10
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s10
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s7
-; GFX9-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s5
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s6
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -72,68 +74,31 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX940-SDAG-LABEL: buffer_nontemporal_load_store:
; GFX940-SDAG: ; %bb.0: ; %entry
; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX940-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
-; GFX940-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
-; GFX940-SDAG-NEXT: s_load_dword s7, s[4:5], 0x30
+; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x10
+; GFX940-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX940-SDAG-NEXT: s_mov_b32 s7, s12
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1
-; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[10:11], off, off offset:40
-; GFX940-SDAG-NEXT: scratch_load_dword v4, off, off offset:36
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1
-; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[12:13], off, off offset:8
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: scratch_load_dword v0, off, off offset:4
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0
-; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec
-; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4)
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11
-; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13
-; GFX940-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-SDAG-NEXT: buffer_load_dword v8, v9, s[4:7], 0 offen nt
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB0_1
-; GFX940-SDAG-NEXT: ; %bb.2:
-; GFX940-SDAG-NEXT: s_mov_b64 exec, s[2:3]
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX940-SDAG-NEXT: s_mov_b64 s[0:1], exec
+; GFX940-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX940-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX940-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX940-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX940-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX940-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt
+; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
+; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
+; GFX940-SDAG-NEXT: s_mov_b32 s5, s12
+; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX940-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX940-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX940-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX940-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX940-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-SDAG-NEXT: buffer_store_dword v8, v4, s[4:7], 0 offen sc0 nt sc1
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB0_3
-; GFX940-SDAG-NEXT: ; %bb.4:
+; GFX940-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 nt sc1
; GFX940-SDAG-NEXT: s_endpgm
;
; GFX940-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -169,31 +134,34 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
; GFX10-SDAG-LABEL: buffer_nontemporal_load_store:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_clause 0x1
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10
; GFX10-SDAG-NEXT: s_mov_b32 s10, 0
-; GFX10-SDAG-NEXT: s_add_u32 s0, s0, s15
+; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_mov_b32 s13, s10
-; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-SDAG-NEXT: s_mov_b32 s12, s7
-; GFX10-SDAG-NEXT: s_or_b64 s[14:15], s[12:13], s[10:11]
-; GFX10-SDAG-NEXT: s_mov_b32 s12, s5
-; GFX10-SDAG-NEXT: s_mov_b32 s13, s6
-; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen slc
+; GFX10-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-SDAG-NEXT: s_mov_b32 s12, s1
+; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
+; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20
-; GFX10-SDAG-NEXT: s_mov_b32 s9, s10
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-SDAG-NEXT: s_mov_b32 s8, s7
-; GFX10-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GFX10-SDAG-NEXT: s_mov_b32 s8, s5
-; GFX10-SDAG-NEXT: s_mov_b32 s9, s6
+; GFX10-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX10-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX10-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
+; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -229,69 +197,37 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
;
; GFX11-SDAG-LABEL: buffer_nontemporal_load_store:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_clause 0x2
+; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX11-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s12
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX11-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
+; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX11-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX11-SDAG-NEXT: scratch_store_b128 off, v[7:10], off
-; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v9, s0
+; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], 0 offen slc dlc
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-SDAG-NEXT: ; %bb.2:
-; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], 0 offen glc slc dlc
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB0_3
-; GFX11-SDAG-NEXT: ; %bb.4:
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -330,76 +266,37 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p
;
; GFX12-SDAG-LABEL: buffer_nontemporal_load_store:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_clause 0x2
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX12-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_mov_b32 s7, s12
+; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX12-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX12-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX12-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-SDAG-NEXT: scratch_store_b128 off, v[7:10], off
+; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
+; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX12-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v9, s0
+; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-SDAG-NEXT: ; %bb.2:
-; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_3
-; GFX12-SDAG-NEXT: ; %bb.4:
+; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: buffer_nontemporal_load_store:
@@ -444,30 +341,32 @@ entry:
define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) {
; GFX9-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10
; GFX9-SDAG-NEXT: s_mov_b32 s10, 0
-; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s15
-; GFX9-SDAG-NEXT: s_mov_b32 s15, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s10
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s14, s7
-; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-SDAG-NEXT: s_mov_b32 s12, s5
-; GFX9-SDAG-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11]
-; GFX9-SDAG-NEXT: s_mov_b32 s13, s6
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc
; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s10
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s10
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s7
-; GFX9-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s5
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s6
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX9-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -503,68 +402,31 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX940-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
; GFX940-SDAG: ; %bb.0: ; %entry
; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX940-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10
-; GFX940-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
-; GFX940-SDAG-NEXT: s_load_dword s7, s[4:5], 0x30
+; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x10
+; GFX940-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX940-SDAG-NEXT: s_mov_b32 s7, s12
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1
-; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[10:11], off, off offset:40
-; GFX940-SDAG-NEXT: scratch_load_dword v4, off, off offset:36
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1
-; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[12:13], off, off offset:8
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: scratch_load_dword v0, off, off offset:4
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s7
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0
-; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec
-; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4)
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11
-; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1)
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13
-; GFX940-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
-; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-SDAG-NEXT: buffer_load_dword v8, v9, s[4:7], 0 offen sc0 sc1
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB1_1
-; GFX940-SDAG-NEXT: ; %bb.2:
-; GFX940-SDAG-NEXT: s_mov_b64 exec, s[2:3]
-; GFX940-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX940-SDAG-NEXT: s_mov_b64 s[0:1], exec
+; GFX940-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX940-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX940-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX940-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX940-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX940-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13]
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
+; GFX940-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30
+; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
+; GFX940-SDAG-NEXT: s_mov_b32 s5, s12
+; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX940-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX940-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX940-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX940-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX940-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
+; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, s0
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-SDAG-NEXT: s_nop 0
-; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-SDAG-NEXT: buffer_store_dword v8, v4, s[4:7], 0 offen sc0 sc1
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB1_3
-; GFX940-SDAG-NEXT: ; %bb.4:
+; GFX940-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
; GFX940-SDAG-NEXT: s_endpgm
;
; GFX940-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -600,31 +462,34 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_clause 0x1
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10
; GFX10-SDAG-NEXT: s_mov_b32 s10, 0
-; GFX10-SDAG-NEXT: s_add_u32 s0, s0, s15
+; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_mov_b32 s13, s10
-; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-SDAG-NEXT: s_mov_b32 s12, s7
-; GFX10-SDAG-NEXT: s_or_b64 s[14:15], s[12:13], s[10:11]
-; GFX10-SDAG-NEXT: s_mov_b32 s12, s5
-; GFX10-SDAG-NEXT: s_mov_b32 s13, s6
-; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc dlc
+; GFX10-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-SDAG-NEXT: s_mov_b32 s12, s1
+; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11]
+; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc
; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20
-; GFX10-SDAG-NEXT: s_mov_b32 s9, s10
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
+; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-SDAG-NEXT: s_mov_b32 s5, s10
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-SDAG-NEXT: s_mov_b32 s8, s7
-; GFX10-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GFX10-SDAG-NEXT: s_mov_b32 s8, s5
-; GFX10-SDAG-NEXT: s_mov_b32 s9, s6
+; GFX10-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11]
+; GFX10-SDAG-NEXT: s_mov_b32 s11, s2
+; GFX10-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX10-SDAG-NEXT: s_mov_b32 s3, s10
+; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
+; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -660,69 +525,37 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
;
; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_clause 0x2
+; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX11-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s12
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX11-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s1
+; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX11-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX11-SDAG-NEXT: scratch_store_b128 off, v[7:10], off
-; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v9, s0
+; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s12
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], 0 offen glc dlc
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-SDAG-NEXT: ; %bb.2:
-; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX11-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], 0 offen dlc
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB1_3
-; GFX11-SDAG-NEXT: ; %bb.4:
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
@@ -761,77 +594,37 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp
;
; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_clause 0x2
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20
-; GFX12-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10
+; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x10
+; GFX12-SDAG-NEXT: s_mov_b32 s12, 0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_mov_b32 s7, s12
+; GFX12-SDAG-NEXT: s_mov_b32 s9, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
-; GFX12-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40
-; GFX12-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36
-; GFX12-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30
-; GFX12-SDAG-NEXT: scratch_store_b128 off, v[7:10], off
+; GFX12-SDAG-NEXT: s_mov_b32 s6, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT: s_mov_b32 s8, s1
+; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
+; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8
-; GFX12-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v7, s6
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v9, s0
+; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-SDAG-NEXT: s_mov_b32 s5, s12
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7
-; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9
-; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-SDAG-NEXT: ; %bb.2:
-; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, s8
-; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-SDAG-NEXT: s_mov_b32 s4, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, s12
+; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
+; GFX12-SDAG-NEXT: s_mov_b32 s13, s2
+; GFX12-SDAG-NEXT: s_mov_b32 s2, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8
-; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4
-; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_3
-; GFX12-SDAG-NEXT: ; %bb.4:
+; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index ffe9e06..5a9f53e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -330,17 +330,17 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -681,13 +681,25 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1038
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP16:%.*]] = load i64, ptr addrspace(1) [[TMP15]], align 2
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i64 [[TMP16]], ptr addrspace(1) [[TMP17]], align 2
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
+; OPT-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
+; OPT-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
+; OPT-NEXT: [[TMP13:%.*]] = load i16, ptr addrspace(1) [[TMP12]], align 2
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
+; OPT-NEXT: store i16 [[TMP13]], ptr addrspace(1) [[TMP14]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
@@ -731,13 +743,17 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
@@ -754,13 +770,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
@@ -804,13 +824,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 4
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -854,13 +878,17 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 4
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -904,13 +932,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -958,17 +990,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(1) [[TMP5]], align 2
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 2
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
-; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 2
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
+; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1028,17 +1060,17 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs
define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1063,17 +1095,17 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(3) [[TMP5]], align 2
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 2
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(3) [[TMP7]], align 2
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 2
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 2
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1098,17 +1130,17 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1133,17 +1165,17 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1168,17 +1200,17 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1693,10 +1725,10 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
+; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
-; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
+; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
@@ -1708,17 +1740,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]]
+; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]]
; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]]
-; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; MAX1024-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]]
+; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024: loop-memcpy-residual:
@@ -1738,17 +1770,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]]
+; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]]
-; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]]
+; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL: loop-memcpy-residual:
@@ -1781,10 +1813,10 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
+; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
-; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
+; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
@@ -1796,17 +1828,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) {
; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]]
+; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]]
; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]]
-; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; MAX1024-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]]
+; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024: loop-memcpy-residual:
@@ -1826,17 +1858,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]]
+; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]]
-; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]]
+; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL: loop-memcpy-residual:
@@ -1871,20 +1903,20 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 256
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
@@ -1896,7 +1928,7 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) {
; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -1918,11 +1950,11 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 16
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
@@ -1930,10 +1962,10 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
-; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
@@ -1965,20 +1997,20 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 256
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
-; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
@@ -1990,7 +2022,7 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds
define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) {
; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -2012,11 +2044,11 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 16
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
@@ -2024,10 +2056,10 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
-; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
@@ -2058,20 +2090,20 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 256
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP2]], align 1
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP5]], align 1
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
-; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256
; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
@@ -2083,7 +2115,7 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d
define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) {
; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -2104,11 +2136,11 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 16
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP9]], align 1
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
@@ -2116,10 +2148,10 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP12]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
-; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
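; Illustration: the widened expansions checked above are produced for variable-size
; llvm.memcpy / llvm.memmove calls between LDS (addrspace(3)) pointers. A minimal
; sketch of such an input is below; the function name is illustrative and not taken
; from the test file. With the updated lowering, the main loop copies <4 x i32>
; (16 bytes) per iteration and the residual loop handles the %n & 15 trailing bytes.
define amdgpu_kernel void @sketch_memcpy_local_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) {
  ; Variable-size copy within LDS; the loop expansion shown in the hunks above
  ; replaces this call.
  call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 %src, i32 %n, i1 false)
  ret void
}
declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3), ptr addrspace(3), i32, i1)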
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
index a68d2e5..bc8bcc6 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -306,10 +306,10 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-LABEL: memmove_p0_p3:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v7, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v8, 0
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; CHECK-NEXT: v_and_b32_e32 v5, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v5, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8]
@@ -338,15 +338,15 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[13:14], v4
-; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -8
+; CHECK-NEXT: ds_read_b128 v[13:16], v4
+; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
-; CHECK-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[13:14]
-; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 8
+; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16]
+; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB2_5
@@ -355,7 +355,7 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB2_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
@@ -414,26 +414,26 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB2_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
-; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -8
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; CHECK-NEXT: v_add3_u32 v2, v3, v2, -8
+; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[3:4], v2
-; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v5, -8
-; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v9, vcc_lo, v0, v5
-; CHECK-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v6, vcc_lo
-; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8]
-; CHECK-NEXT: v_mov_b32_e32 v5, v7
-; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2
-; CHECK-NEXT: v_mov_b32_e32 v6, v8
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
+; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5
+; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4]
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
+; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: s_or_b32 s7, s4, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[3:4]
+; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB2_15
; CHECK-NEXT: .LBB2_16: ; %Flow36
@@ -1043,9 +1043,9 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-LABEL: memmove_p1_p3:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1056,16 +1056,16 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .LBB7_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[10:11], v9
-; CHECK-NEXT: v_add_co_u32 v12, vcc_lo, v0, s4
-; CHECK-NEXT: s_add_u32 s4, s4, 8
-; CHECK-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: ds_read_b128 v[10:13], v9
+; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4
+; CHECK-NEXT: s_add_u32 s4, s4, 16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v[12:13], v[10:11], off
+; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB7_2
; CHECK-NEXT: .LBB7_3: ; %Flow9
@@ -1076,7 +1076,7 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB7_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
@@ -1327,11 +1327,11 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-LABEL: memmove_p3_p0:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6]
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo
@@ -1361,16 +1361,16 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flat_load_dwordx2 v[13:14], v[9:10]
-; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -8
+; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10]
+; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
-; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 8
+; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s6, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v4, v[13:14]
-; CHECK-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; CHECK-NEXT: ds_write_b128 v4, v[13:16]
+; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB10_5
; CHECK-NEXT: .LBB10_6: ; %Flow34
@@ -1378,7 +1378,7 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB10_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7
@@ -1437,23 +1437,23 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB10_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
-; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -8
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo
-; CHECK-NEXT: v_add3_u32 v0, v3, v0, -8
+; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -8
+; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo
-; CHECK-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4]
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v0, v[3:4]
-; CHECK-NEXT: v_add_nc_u32_e32 v0, -8, v0
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_cbranch_execnz .LBB10_15
; CHECK-NEXT: .LBB10_16: ; %Flow36
@@ -1470,9 +1470,9 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-LABEL: memmove_p3_p1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1485,14 +1485,14 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
-; CHECK-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
+; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v9, v[10:11]
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: ds_write_b128 v9, v[10:13]
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB11_2
; CHECK-NEXT: .LBB11_3: ; %Flow9
@@ -1503,7 +1503,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB11_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
@@ -1538,8 +1538,8 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v5, 0
-; CHECK-NEXT: v_and_b32_e32 v4, 7, v2
-; CHECK-NEXT: v_and_b32_e32 v6, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v4, 15, v2
+; CHECK-NEXT: v_and_b32_e32 v6, -16, v2
; CHECK-NEXT: v_mov_b32_e32 v7, v3
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5]
@@ -1563,15 +1563,15 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .LBB12_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[9:10], v3
-; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -8
+; CHECK-NEXT: ds_read_b128 v[9:12], v3
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
-; CHECK-NEXT: v_add_nc_u32_e32 v3, 8, v3
+; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v8, v[9:10]
-; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: ds_write_b128 v8, v[9:12]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB12_5
; CHECK-NEXT: .LBB12_6: ; %Flow41
@@ -1579,7 +1579,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB12_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v2
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
@@ -1630,24 +1630,24 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB12_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v5, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v5, -16, v2
; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v4, -8, v5
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4
; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5
; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4
; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[5:6], v4
-; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 8
+; CHECK-NEXT: ds_read_b128 v[5:8], v4
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; CHECK-NEXT: v_add_nc_u32_e32 v4, -8, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v2, v[5:6]
-; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2
+; CHECK-NEXT: ds_write_b128 v2, v[5:8]
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB12_15
; CHECK-NEXT: .LBB12_16: ; %Flow43
@@ -1664,9 +1664,9 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-LABEL: memmove_p3_p4:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1679,14 +1679,14 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
-; CHECK-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
+; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v9, v[10:11]
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: ds_write_b128 v9, v[10:13]
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB13_2
; CHECK-NEXT: .LBB13_3: ; %Flow9
@@ -1697,7 +1697,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB13_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
@@ -1735,27 +1735,30 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v4
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3]
; CHECK-NEXT: s_cbranch_execz .LBB14_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v7, v1
; CHECK-NEXT: v_mov_b32_e32 v8, v0
; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: buffer_load_dword v9, v7, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v10, v7, s[0:3], 0 offen offset:4
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: buffer_load_dword v11, v7, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v12, v7, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v7, 8, v7
+; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v8, v[9:10]
-; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: ds_write_b128 v8, v[9:12]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB14_2
; CHECK-NEXT: .LBB14_3: ; %Flow14
@@ -1766,7 +1769,7 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB14_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
@@ -2021,25 +2024,28 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v4
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3]
; CHECK-NEXT: s_cbranch_execz .LBB17_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v7, v1
; CHECK-NEXT: v_mov_b32_e32 v8, v0
; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[9:10], v7
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: ds_read_b128 v[9:12], v7
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v7, 8, v7
+; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen
-; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB17_2
@@ -2051,7 +2057,7 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB17_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
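; Illustration: the assembly checks above are generated from variable-size memmove
; calls between address spaces. A minimal sketch of one such function is below; the
; function name is illustrative and the i64 size type is an assumption, not copied
; from the test file. With the 16-byte-wide lowering, the main loops now select
; ds_read_b128 / flat_store_dwordx4 style instructions in place of the previous
; 8-byte b64 / dwordx2 forms, and the residual mask changes from 7 to 15.
define void @sketch_memmove_flat_from_lds(ptr %dst, ptr addrspace(3) %src, i64 %size) {
entry:
  ; Variable-size move from LDS to a flat pointer; expanded into forward/backward
  ; copy loops like those checked above.
  call void @llvm.memmove.p0.p3.i64(ptr align 1 %dst, ptr addrspace(3) align 1 %src, i64 %size, i1 false)
  ret void
}
declare void @llvm.memmove.p0.p3.i64(ptr, ptr addrspace(3), i64, i1)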
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index af7f927..a6db7d3 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -828,81 +828,30 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 {
; EG-LABEL: s_test_imin_sle_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @28, KC0[], KC1[]
-; EG-NEXT: TEX 1 @12
-; EG-NEXT: ALU 9, @30, KC0[], KC1[]
-; EG-NEXT: TEX 1 @16
-; EG-NEXT: ALU 10, @40, KC0[], KC1[]
-; EG-NEXT: TEX 1 @20
-; EG-NEXT: ALU 10, @51, KC0[], KC1[]
-; EG-NEXT: TEX 1 @24
-; EG-NEXT: ALU 11, @62, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1
+; EG-NEXT: ALU 0, @14, KC0[], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 9, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 58, #3
-; EG-NEXT: Fetch clause starting at 16:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3
-; EG-NEXT: Fetch clause starting at 20:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3
-; EG-NEXT: VTX_READ_16 T7.X, T5.X, 54, #3
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
-; EG-NEXT: VTX_READ_16 T5.X, T5.X, 52, #3
-; EG-NEXT: ALU clause starting at 28:
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: MOV * T5.X, 0.0,
-; EG-NEXT: ALU clause starting at 30:
-; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
-; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 40:
-; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 51:
-; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 46, #3
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 52, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 44, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 54, #3
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 62:
-; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
-; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: MIN_INT T0.Y, PV.Z, PV.W,
+; EG-NEXT: BFE_INT T0.Z, T3.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T2.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
-; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT T1.W, T0.Y, literal.y,
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), -65536(nan)
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T6.X, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.X,
-; EG-NEXT: MOV * T6.Y, T3.X,
+; EG-NEXT: MIN_INT T0.X, PV.Z, PV.W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v4i16:
; CI: ; %bb.0:
@@ -1848,49 +1797,40 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_umin_ule_v3i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @20, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 1 @8
-; EG-NEXT: ALU 11, @24, KC0[CB0:0-32], KC1[]
-; EG-NEXT: TEX 3 @12
-; EG-NEXT: ALU 8, @36, KC0[], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T8.X, 0
-; EG-NEXT: MEM_RAT MSKOR T7.XW, T0.X
+; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 17, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
+; EG-NEXT: MEM_RAT MSKOR T4.XW, T0.X
; EG-NEXT: CF_END
-; EG-NEXT: Fetch clause starting at 8:
-; EG-NEXT: VTX_READ_16 T7.X, T6.X, 4, #1
-; EG-NEXT: VTX_READ_16 T8.X, T0.X, 4, #1
-; EG-NEXT: Fetch clause starting at 12:
-; EG-NEXT: VTX_READ_16 T8.X, T6.X, 0, #1
-; EG-NEXT: VTX_READ_16 T9.X, T0.X, 0, #1
-; EG-NEXT: VTX_READ_16 T6.X, T6.X, 2, #1
-; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
-; EG-NEXT: ALU clause starting at 20:
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T2.X, T1.X, 0, #1
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_16 T1.X, T1.X, 4, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ADD_INT * T6.X, KC0[2].W, PV.W,
-; EG-NEXT: ALU clause starting at 24:
+; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
+; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT * T2.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T2.W, PV.W, literal.x,
-; EG-NEXT: MIN_UINT * T3.W, T8.X, T7.X,
+; EG-NEXT: MIN_UINT * T3.W, T0.X, T1.X,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: LSHL T7.X, PS, PV.W,
-; EG-NEXT: LSHL * T7.W, literal.x, PV.W,
+; EG-NEXT: LSHL T4.X, PS, PV.W,
+; EG-NEXT: LSHL * T4.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MOV * T7.Y, 0.0,
-; EG-NEXT: ALU clause starting at 36:
-; EG-NEXT: MOV T7.Z, 0.0,
-; EG-NEXT: MIN_UINT * T2.W, T0.X, T6.X,
+; EG-NEXT: MOV T4.Y, 0.0,
+; EG-NEXT: MOV * T4.Z, 0.0,
; EG-NEXT: LSHR T0.X, T1.W, literal.x,
-; EG-NEXT: LSHL T1.W, PV.W, literal.y,
-; EG-NEXT: MIN_UINT * T2.W, T9.X, T8.X,
-; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
-; EG-NEXT: OR_INT T6.X, PV.W, PS,
-; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
+; EG-NEXT: MIN_UINT * T1.X, T3.X, T2.X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR * T2.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_umin_ule_v3i16:
@@ -2936,142 +2876,46 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 {
; EG-LABEL: s_test_umin_ult_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @52, KC0[], KC1[]
-; EG-NEXT: TEX 1 @20
-; EG-NEXT: ALU 9, @54, KC0[], KC1[]
-; EG-NEXT: TEX 1 @24
-; EG-NEXT: ALU 8, @64, KC0[], KC1[]
-; EG-NEXT: TEX 1 @28
-; EG-NEXT: ALU 10, @73, KC0[], KC1[]
-; EG-NEXT: TEX 1 @32
-; EG-NEXT: ALU 8, @84, KC0[], KC1[]
-; EG-NEXT: TEX 1 @36
-; EG-NEXT: ALU 10, @93, KC0[], KC1[]
-; EG-NEXT: TEX 1 @40
-; EG-NEXT: ALU 8, @104, KC0[], KC1[]
-; EG-NEXT: TEX 1 @44
-; EG-NEXT: ALU 10, @113, KC0[], KC1[]
-; EG-NEXT: TEX 1 @48
-; EG-NEXT: ALU 10, @124, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
+; EG-NEXT: ALU 0, @24, KC0[], KC1[]
+; EG-NEXT: TEX 2 @8
+; EG-NEXT: ALU 2, @25, KC0[], KC1[]
+; EG-NEXT: TEX 4 @14
+; EG-NEXT: ALU 14, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
-; EG-NEXT: Fetch clause starting at 20:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 82, #3
-; EG-NEXT: Fetch clause starting at 24:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 80, #3
-; EG-NEXT: Fetch clause starting at 28:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 78, #3
-; EG-NEXT: Fetch clause starting at 32:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 76, #3
-; EG-NEXT: Fetch clause starting at 36:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 74, #3
-; EG-NEXT: Fetch clause starting at 40:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 72, #3
-; EG-NEXT: Fetch clause starting at 44:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
-; EG-NEXT: VTX_READ_16 T9.X, T7.X, 70, #3
-; EG-NEXT: Fetch clause starting at 48:
-; EG-NEXT: VTX_READ_16 T8.X, T7.X, 52, #3
-; EG-NEXT: VTX_READ_16 T7.X, T7.X, 68, #3
-; EG-NEXT: ALU clause starting at 52:
-; EG-NEXT: MOV * T0.Y, T3.X,
-; EG-NEXT: MOV * T7.X, 0.0,
-; EG-NEXT: ALU clause starting at 54:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
-; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 64:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
-; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T3.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T2.X,
-; EG-NEXT: ALU clause starting at 73:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T2.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 84:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
-; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
-; EG-NEXT: MOV T2.X, PV.Z,
-; EG-NEXT: MOV * T0.Y, T5.X,
-; EG-NEXT: ALU clause starting at 93:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 104:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 62, #3
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 60, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 78, #3
+; EG-NEXT: Fetch clause starting at 14:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 68, #3
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 52, #3
+; EG-NEXT: VTX_READ_16 T4.X, T0.X, 70, #3
+; EG-NEXT: VTX_READ_16 T5.X, T0.X, 54, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 76, #3
+; EG-NEXT: ALU clause starting at 24:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 25:
+; EG-NEXT: AND_INT T0.W, T1.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, T3.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
-; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, PS,
-; EG-NEXT: MOV T5.X, PV.W,
-; EG-NEXT: MOV * T0.Y, T4.X,
-; EG-NEXT: ALU clause starting at 113:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
+; EG-NEXT: ALU clause starting at 28:
+; EG-NEXT: AND_INT T0.Z, T2.X, literal.x,
+; EG-NEXT: AND_INT T2.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: MIN_UINT * T0.W, T0.W, T1.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
+; EG-NEXT: MIN_UINT T0.Z, PV.Z, PV.W,
+; EG-NEXT: AND_INT T1.W, T5.X, literal.x,
+; EG-NEXT: AND_INT * T2.W, T4.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV * T0.Y, PV.X,
-; EG-NEXT: ALU clause starting at 124:
-; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
-; EG-NEXT: AND_INT * T1.W, T7.X, literal.x,
+; EG-NEXT: MIN_UINT T0.Y, PV.W, PS,
+; EG-NEXT: AND_INT T1.W, T3.X, literal.x,
+; EG-NEXT: AND_INT * T2.W, T1.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
-; EG-NEXT: AND_INT T2.W, T0.Y, literal.y,
-; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
-; EG-NEXT: 2(2.802597e-45), -65536(nan)
-; EG-NEXT: OR_INT * T7.X, PV.W, PS,
-; EG-NEXT: MOV T4.X, PV.X,
-; EG-NEXT: MOV * T7.W, T3.X,
-; EG-NEXT: MOV * T7.Y, T5.X,
+; EG-NEXT: MIN_UINT T0.X, PV.W, PS,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_umin_ult_v8i16:
; CI: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
index c375b16..7e867a5 100644
--- a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-FAKE16 %s
define amdgpu_ps float @test_minmax_f32(float %a, float %b, float %c) {
; GFX12-LABEL: test_minmax_f32:
@@ -72,30 +74,84 @@ define amdgpu_ps float @test_maxmin_commuted_f32(float %a, float %b, float %c) {
}
define amdgpu_ps half @test_minmax_f16(half %a, half %b, half %c) {
-; GFX12-LABEL: test_minmax_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
-; GFX12-NEXT: ; return to shader part epilog
+; SDAG-TRUE16-LABEL: test_minmax_f16:
+; SDAG-TRUE16: ; %bb.0:
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-FAKE16-LABEL: test_minmax_f16:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
+; SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-TRUE16-LABEL: test_minmax_f16:
+; GISEL-TRUE16: ; %bb.0:
+; GISEL-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-FAKE16-LABEL: test_minmax_f16:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
+; GISEL-FAKE16-NEXT: ; return to shader part epilog
%max = call half @llvm.maximum.f16(half %a, half %b)
%minmax = call half @llvm.minimum.f16(half %max, half %c)
ret half %minmax
}
define amdgpu_ps half @test_minmax_commuted_f16(half %a, half %b, half %c) {
-; GFX12-LABEL: test_minmax_commuted_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
-; GFX12-NEXT: ; return to shader part epilog
+; SDAG-TRUE16-LABEL: test_minmax_commuted_f16:
+; SDAG-TRUE16: ; %bb.0:
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-FAKE16-LABEL: test_minmax_commuted_f16:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
+; SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-TRUE16-LABEL: test_minmax_commuted_f16:
+; GISEL-TRUE16: ; %bb.0:
+; GISEL-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-FAKE16-LABEL: test_minmax_commuted_f16:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
+; GISEL-FAKE16-NEXT: ; return to shader part epilog
%max = call half @llvm.maximum.f16(half %a, half %b)
%minmax = call half @llvm.minimum.f16(half %c, half %max)
ret half %minmax
}
define amdgpu_ps half @test_maxmin_commuted_f16(half %a, half %b, half %c) {
-; GFX12-LABEL: test_maxmin_commuted_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_minimummaximum_f16 v0, v0, v1, v2
-; GFX12-NEXT: ; return to shader part epilog
+; SDAG-TRUE16-LABEL: test_maxmin_commuted_f16:
+; SDAG-TRUE16: ; %bb.0:
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-TRUE16-NEXT: v_minimummaximum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-FAKE16-LABEL: test_maxmin_commuted_f16:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: v_minimummaximum_f16 v0, v0, v1, v2
+; SDAG-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-TRUE16-LABEL: test_maxmin_commuted_f16:
+; GISEL-TRUE16: ; %bb.0:
+; GISEL-TRUE16-NEXT: v_minimummaximum_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-FAKE16-LABEL: test_maxmin_commuted_f16:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: v_minimummaximum_f16 v0, v0, v1, v2
+; GISEL-FAKE16-NEXT: ; return to shader part epilog
%min = call half @llvm.minimum.f16(half %a, half %b)
%maxmin = call half @llvm.maximum.f16(half %c, half %min)
ret half %maxmin
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 774a22f..954dab3 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11,SDAG-GFX11-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11,SDAG-GFX11-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-FAKE16 %s
define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_minmax_i32:
@@ -467,47 +471,111 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
}
define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
-; GFX11-LABEL: test_minmax_f16_ieee_false:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
-; GFX11-NEXT: ; return to shader part epilog
-;
-; GFX12-LABEL: test_minmax_f16_ieee_false:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
-; GFX12-NEXT: ; return to shader part epilog
+; SDAG-GFX11-TRUE16-LABEL: test_minmax_f16_ieee_false:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX11-FAKE16-LABEL: test_minmax_f16_ieee_false:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; SDAG-GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX11-TRUE16-LABEL: test_minmax_f16_ieee_false:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX11-FAKE16-LABEL: test_minmax_f16_ieee_false:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; GISEL-GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX12-TRUE16-LABEL: test_minmax_f16_ieee_false:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX12-FAKE16-LABEL: test_minmax_f16_ieee_false:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX12-TRUE16-LABEL: test_minmax_f16_ieee_false:
+; GISEL-GFX12-TRUE16: ; %bb.0:
+; GISEL-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-GFX12-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX12-FAKE16-LABEL: test_minmax_f16_ieee_false:
+; GISEL-GFX12-FAKE16: ; %bb.0:
+; GISEL-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-FAKE16-NEXT: ; return to shader part epilog
%max = call half @llvm.maxnum.f16(half %a, half %b)
%minmax = call half @llvm.minnum.f16(half %max, half %c)
ret half %minmax
}
define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-GFX11-LABEL: s_test_minmax_f16_ieee_false:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-GFX11-NEXT: s_mov_b32 s5, s4
-; SDAG-GFX11-NEXT: s_mov_b32 s4, s3
-; SDAG-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0
-; SDAG-GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
-; SDAG-GFX11-NEXT: s_endpgm
-;
-; GISEL-GFX11-LABEL: s_test_minmax_f16_ieee_false:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-GFX11-NEXT: s_mov_b32 s6, s3
-; GISEL-GFX11-NEXT: s_mov_b32 s7, s4
-; GISEL-GFX11-NEXT: v_maxmin_f16 v0, s0, s1, v0
-; GISEL-GFX11-NEXT: global_store_b16 v1, v0, s[6:7]
-; GISEL-GFX11-NEXT: s_endpgm
-;
-; SDAG-GFX12-LABEL: s_test_minmax_f16_ieee_false:
-; SDAG-GFX12: ; %bb.0:
-; SDAG-GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-GFX12-NEXT: s_mov_b32 s5, s4
-; SDAG-GFX12-NEXT: s_mov_b32 s4, s3
-; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, s0, s1, v0
-; SDAG-GFX12-NEXT: global_store_b16 v1, v0, s[4:5]
-; SDAG-GFX12-NEXT: s_endpgm
+; SDAG-GFX11-TRUE16-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-GFX11-TRUE16-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX11-TRUE16-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, s0, s1, v0.l
+; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX11-TRUE16-NEXT: s_endpgm
+;
+; SDAG-GFX11-FAKE16-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX11-FAKE16-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX11-FAKE16-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, s0, s1, v0
+; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX11-FAKE16-NEXT: s_endpgm
+;
+; GISEL-GFX11-TRUE16-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-GFX11-TRUE16-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX11-TRUE16-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, s0, s1, v0.l
+; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX11-TRUE16-NEXT: s_endpgm
+;
+; GISEL-GFX11-FAKE16-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-FAKE16-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX11-FAKE16-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, s0, s1, v0
+; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX11-FAKE16-NEXT: s_endpgm
+;
+; SDAG-GFX12-TRUE16-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-GFX12-TRUE16-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX12-TRUE16-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, s0, s1, v0.l
+; SDAG-GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX12-TRUE16-NEXT: s_endpgm
+;
+; SDAG-GFX12-FAKE16-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX12-FAKE16-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX12-FAKE16-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, s0, s1, v0
+; SDAG-GFX12-FAKE16-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX12-FAKE16-NEXT: s_endpgm
;
; GISEL-GFX12-LABEL: s_test_minmax_f16_ieee_false:
; GISEL-GFX12: ; %bb.0:
@@ -526,136 +594,320 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
}
define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; SDAG-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GISEL-GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
-; SDAG-GFX12: ; %bb.0:
-; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
-; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
-; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
-; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; SDAG-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
-; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
-; GISEL-GFX12: ; %bb.0:
-; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
-; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
-; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
-; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GISEL-GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
-; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h
+; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
+; GISEL-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v0.h, v1.l
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-FAKE16-NEXT: v_maxmin_f16 v0, v0, v1, v2
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h
+; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX12-TRUE16: ; %bb.0:
+; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; GISEL-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l
+; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX12-FAKE16: ; %bb.0:
+; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half %a, half %b)
%minmax = call half @llvm.minnum.f16(half %c, half %max)
ret half %minmax
}
define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
-; GFX11-LABEL: test_maxmin_f16_ieee_false:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2
-; GFX11-NEXT: ; return to shader part epilog
-;
-; GFX12-LABEL: test_maxmin_f16_ieee_false:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2
-; GFX12-NEXT: ; return to shader part epilog
+; SDAG-GFX11-TRUE16-LABEL: test_maxmin_f16_ieee_false:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX11-FAKE16-LABEL: test_maxmin_f16_ieee_false:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2
+; SDAG-GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX11-TRUE16-LABEL: test_maxmin_f16_ieee_false:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX11-FAKE16-LABEL: test_maxmin_f16_ieee_false:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2
+; GISEL-GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX12-TRUE16-LABEL: test_maxmin_f16_ieee_false:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX12-FAKE16-LABEL: test_maxmin_f16_ieee_false:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX12-TRUE16-LABEL: test_maxmin_f16_ieee_false:
+; GISEL-GFX12-TRUE16: ; %bb.0:
+; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-GFX12-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX12-FAKE16-LABEL: test_maxmin_f16_ieee_false:
+; GISEL-GFX12-FAKE16: ; %bb.0:
+; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-FAKE16-NEXT: ; return to shader part epilog
%min = call half @llvm.minnum.f16(half %a, half %b)
%maxmin = call half @llvm.maxnum.f16(half %min, half %c)
ret half %maxmin
}
define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; SDAG-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; SDAG-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; SDAG-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GISEL-GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; GISEL-GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GISEL-GFX11-NEXT: v_minmax_f16 v0, v0, v1, v2
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
-; SDAG-GFX12: ; %bb.0:
-; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
-; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
-; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
-; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; SDAG-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; SDAG-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2
-; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
-; GISEL-GFX12: ; %bb.0:
-; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
-; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
-; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
-; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1
-; GISEL-GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GISEL-GFX12-NEXT: v_minmax_num_f16 v0, v0, v1, v2
-; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h
+; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GISEL-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
+; GISEL-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v0.h, v1.l
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-FAKE16-NEXT: v_minmax_f16 v0, v0, v1, v2
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h
+; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX12-TRUE16: ; %bb.0:
+; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GISEL-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; GISEL-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l
+; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX12-FAKE16: ; %bb.0:
+; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half %a, half %b)
%maxmin = call half @llvm.maxnum.f16(half %c, half %min)
ret half %maxmin
}
define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 {
-; GFX11-LABEL: test_med3_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_med3_f16 v2, v2, v3, v4
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_med3_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_med3_num_f16 v2, v2, v3, v4
-; GFX12-NEXT: global_store_b16 v[0:1], v2, off
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: test_med3_f16:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
+; SDAG-GFX11-TRUE16-NEXT: v_med3_f16 v2.l, v2.l, v2.h, v3.l
+; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: test_med3_f16:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_med3_f16 v2, v2, v3, v4
+; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-TRUE16-LABEL: test_med3_f16:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_med3_f16 v2.l, v2.l, v3.l, v4.l
+; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: test_med3_f16:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_med3_f16 v2, v2, v3, v4
+; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-TRUE16-LABEL: test_med3_f16:
+; SDAG-GFX12-TRUE16: ; %bb.0:
+; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
+; SDAG-GFX12-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v2.h, v3.l
+; SDAG-GFX12-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-FAKE16-LABEL: test_med3_f16:
+; SDAG-GFX12-FAKE16: ; %bb.0:
+; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4
+; SDAG-GFX12-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-TRUE16-LABEL: test_med3_f16:
+; GISEL-GFX12-TRUE16: ; %bb.0:
+; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v3.l, v4.l
+; GISEL-GFX12-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-FAKE16-LABEL: test_med3_f16:
+; GISEL-GFX12-FAKE16: ; %bb.0:
+; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4
+; GISEL-GFX12-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call half @llvm.minnum.f16(half %x, half %y)
%tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
%tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
index f89341d..7536e83 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -53,6 +53,7 @@
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_cs
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
@@ -109,6 +110,7 @@
; CHECK-NEXT: .wgp_mode: false
; CHECK-NEXT: .gs:
; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_gs
; CHECK-NEXT: .entry_point_symbol: gs_shader
; CHECK-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x200
@@ -120,6 +122,7 @@
; CHECK-NEXT: .wgp_mode: true
; CHECK-NEXT: .hs:
; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_hs
; CHECK-NEXT: .entry_point_symbol: hs_shader
; CHECK-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x1000
@@ -131,6 +134,7 @@
; CHECK-NEXT: .wgp_mode: true
; CHECK-NEXT: .ps:
; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_ps
; CHECK-NEXT: .entry_point_symbol: ps_shader
; CHECK-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll
index 676ba14..efb8d83 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-s-time.ll
@@ -7,7 +7,6 @@
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=COMPATIBLE,REALTIME,MEMTIME %s
; RUN: FileCheck -allow-empty --check-prefixes=WARN-REALTIME,WARN-MEMTIME %s < %t
-; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s
@@ -17,7 +16,6 @@
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions %s -o - 2>%t | FileCheck -check-prefixes=INCOMPATIBLE,NOREALTIME,NOMEMTIME %s
; RUN: FileCheck --check-prefixes=WARN-NOREALTIME,WARN-NOMEMTIME %s < %t
-; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1102 -verify-machineinstrs < %s
; Note: This test checks the IR, but also has a run line to codegen the file just to check we
; do not crash when trying to select those functions.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
index 75a388e..038f49f3 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave32-feature.ll
@@ -14,7 +14,6 @@
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1011 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s
@@ -22,7 +21,6 @@
; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -enable-new-pm -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s
; WARN-GFX906: removing function 'needs_wavefrontsize32': +wavefrontsize32 is not supported on the current target
; WARN-GFX906-NOT: not supported
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 7e7f4f5..c9efeee 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -681,63 +681,30 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; EG-LABEL: shl_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 42, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
+; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1
+; EG-NEXT: VTX_READ_128 T8.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T6.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT * T1.W, T10.Z, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, T10.X, PV.W,
-; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T2.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T1.W, PS, PV.W,
-; EG-NEXT: MOV * T6.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR T1.W, T10.Z, literal.x,
-; EG-NEXT: LSHR * T2.W, T10.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, PS, PV.W,
-; EG-NEXT: AND_INT * T2.W, PV.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHR T1.W, T8.Z, literal.x,
+; EG-NEXT: LSHR * T2.W, T8.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: MOV T6.X, PV.W,
-; EG-NEXT: MOV * T0.X, T7.X,
-; EG-NEXT: AND_INT * T1.W, T10.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL T1.W, T10.Y, PV.W,
-; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL T0.Y, PS, PV.W,
+; EG-NEXT: AND_INT T1.W, T8.Z, literal.x,
+; EG-NEXT: AND_INT * T2.W, T8.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T1.W, T2.W, PV.W,
-; EG-NEXT: MOV * T7.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR T1.W, T10.W, literal.x,
-; EG-NEXT: LSHR * T2.W, T10.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL * T1.W, PS, PV.W,
-; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
-; EG-NEXT: LSHL T1.W, PV.W, literal.y,
+; EG-NEXT: LSHL T0.X, PS, PV.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
-; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
-; EG-NEXT: LSHR T0.X, PS, literal.x,
-; EG-NEXT: OR_INT * T10.Y, PV.Z, PV.W,
+; EG-NEXT: LSHR * T8.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T7.X, PV.Y,
-; EG-NEXT: MOV * T10.X, T6.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index ef1adbb..386a046 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -323,67 +323,28 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; EG-LABEL: ashr_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 48, @10, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
+; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XY, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T9.XYZW, T9.X, 0, #1
+; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV * T0.Y, T6.X,
-; EG-NEXT: MOV * T9.X, KC0[2].Z,
-; EG-NEXT: ALU clause starting at 10:
-; EG-NEXT: BFE_INT T0.W, T9.X, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.Z, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: ASHR * T0.W, PV.W, PS,
-; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 65535(9.183409e-41), -65536(nan)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T6.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T9.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T9.Z, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T6.X, PV.W,
-; EG-NEXT: MOV T0.Y, T7.X,
-; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x,
-; EG-NEXT: AND_INT * T1.W, T9.W, literal.y,
+; EG-NEXT: MOV * T7.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHR T0.Z, T7.X, literal.x,
+; EG-NEXT: BFE_INT T0.W, T7.X, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, T7.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: ASHR T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T7.X, PV.W,
-; EG-NEXT: MOV T0.Y, PV.X,
-; EG-NEXT: LSHR * T0.W, T9.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
-; EG-NEXT: LSHR * T1.W, T9.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR T0.W, PV.W, PS,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: ASHR T7.X, PV.W, PS,
+; EG-NEXT: BFE_INT T0.W, PV.Z, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.W, T7.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T10.Y, T1.W, PV.W,
+; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: ASHR * T7.Y, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T7.X, PV.Y,
-; EG-NEXT: MOV * T10.X, T6.X,
%b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in, i16 1
%a = load <4 x i16>, ptr addrspace(1) %in
%b = load <4 x i16>, ptr addrspace(1) %b_ptr
diff --git a/llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll b/llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll
new file mode 100644
index 0000000..1c3091f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/truncate-lshr-cast-build-vector-combine.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; extract element 0 as shift
+define i32 @cast_v4i32_to_i128_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %trunc = trunc i128 %bigint to i32
+ ret i32 %trunc
+}
+
+; extract element 1 as shift
+define i32 @cast_v4i32_to_i128_lshr_32_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_32_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 32
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; extract element 2 as shift
+define i32 @cast_v4i32_to_i128_lshr_64_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_64_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 64
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; extract element 3 as shift
+define i32 @cast_v4i32_to_i128_lshr_96_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_96_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v3
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 96
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; Shift not aligned to element, not a simple extract
+define i32 @cast_v4i32_to_i128_lshr_33_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_33_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_alignbit_b32 v0, v2, v1, 1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 33
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; extract misaligned element
+define i32 @cast_v4i32_to_i128_lshr_31_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_31_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_alignbit_b32 v0, v1, v0, 31
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 31
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; extract misaligned element
+define i32 @cast_v4i32_to_i128_lshr_48_trunc_i32(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_48_trunc_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x1000706
+; CHECK-NEXT: v_perm_b32 v0, v1, v2, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 48
+ %trunc = trunc i128 %srl to i32
+ ret i32 %trunc
+}
+
+; extract elements 1 and 2 with shift
+define i64 @cast_v4i32_to_i128_lshr_32_trunc_i64(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_32_trunc_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 32
+ %trunc = trunc i128 %srl to i64
+ ret i64 %trunc
+}
+
+; extract elements 2 and 3 with shift
+define i64 @cast_v4i32_to_i128_lshr_64_trunc_i64(<4 x i32> %arg) {
+; CHECK-LABEL: cast_v4i32_to_i128_lshr_64_trunc_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, v3
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %bigint = bitcast <4 x i32> %arg to i128
+ %srl = lshr i128 %bigint, 64
+ %trunc = trunc i128 %srl to i64
+ ret i64 %trunc
+}
+
+; FIXME: We don't process this case because we see multiple bitcasts
+; before a 32-bit build_vector
+define i32 @build_vector_i16_to_shift(i16 %arg0, i16 %arg1, i16 %arg2, i16 %arg3) {
+; CHECK-LABEL: build_vector_i16_to_shift:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x5040100
+; CHECK-NEXT: v_perm_b32 v0, v3, v2, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %ins.0 = insertelement <4 x i16> poison, i16 %arg0, i32 0
+ %ins.1 = insertelement <4 x i16> %ins.0, i16 %arg1, i32 1
+ %ins.2 = insertelement <4 x i16> %ins.1, i16 %arg2, i32 2
+ %ins.3 = insertelement <4 x i16> %ins.2, i16 %arg3, i32 3
+
+ %cast = bitcast <4 x i16> %ins.3 to i64
+ %srl = lshr i64 %cast, 32
+ %trunc = trunc i64 %srl to i32
+ ret i32 %trunc
+}
diff --git a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
index e3a6240..fdc1e6a 100644
--- a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
@@ -12,6 +12,7 @@
; GCN-NEXT: amdpal.pipelines:
; GCN-NEXT: - .hardware_stages:
; GCN-NEXT: .cs:
+; GCN-NEXT: .entry_point: _amdgpu_cs
; GCN-NEXT: .entry_point_symbol: _amdgpu_cs_main
; GCN-NEXT: .scratch_memory_size: 0
; SI-NEXT: .sgpr_count: 0x11
diff --git a/llvm/test/CodeGen/ARM/machine-copyprop.mir b/llvm/test/CodeGen/ARM/machine-copyprop.mir
index f43c388..73f8830 100644
--- a/llvm/test/CodeGen/ARM/machine-copyprop.mir
+++ b/llvm/test/CodeGen/ARM/machine-copyprop.mir
@@ -1,5 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -o - %s -mtriple=armv7s-- -run-pass=machine-cp | FileCheck %s
+
+# RUN: llc -o - %s -mtriple=armv7s-- -passes=machine-cp | FileCheck %s
---
# Test that machine copy prop recognizes the implicit-def operands on a COPY
# as clobbering the register.
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
new file mode 100644
index 0000000..f80b5a5
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
+
+declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols)
+declare void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols)
+declare void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %addr, i32 %ncols)
+declare void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %addr, i32 %ncols)
+
+; CHECK-LABEL: test_tcgen05_alloc
+define void @test_tcgen05_alloc(ptr %addr, i32 %ncols) {
+; CHECK_PTX64-LABEL: test_tcgen05_alloc(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_alloc_param_0];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_param_1];
+; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1;
+; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>;
+; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: ld.param.u64 %rd1, [test_tcgen05_alloc_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_param_1];
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1;
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1;
+; CHECK_PTX64_SHARED32-NEXT: ret;
+ call void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols)
+ call void @llvm.nvvm.tcgen05.alloc.cg2(ptr %addr, i32 %ncols)
+
+ ret void
+}
+
+; CHECK-LABEL: test_tcgen05_alloc_shared
+define void @test_tcgen05_alloc_shared(ptr addrspace(3) %addr, i32 %ncols) {
+; CHECK_PTX64-LABEL: test_tcgen05_alloc_shared(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_alloc_shared_param_0];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_shared_param_1];
+; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%rd1], %r1;
+; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%rd1], %r1;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_alloc_shared(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>;
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_shared_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r2, [test_tcgen05_alloc_shared_param_1];
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%r1], %r2;
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%r1], %r2;
+; CHECK_PTX64_SHARED32-NEXT: ret;
+ call void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %addr, i32 %ncols)
+
+ call void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %addr, i32 %ncols)
+ ret void
+}
+
+declare void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %tmem_addr, i32 %ncols)
+declare void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %tmem_addr, i32 %ncols)
+
+; CHECK-LABEL: test_tcgen05_dealloc
+define void @test_tcgen05_dealloc(ptr addrspace(6) %tmem_addr, i32 %ncols) {
+; CHECK_PTX64-LABEL: test_tcgen05_dealloc(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<3>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [test_tcgen05_dealloc_param_0];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [test_tcgen05_dealloc_param_1];
+; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2;
+; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_dealloc(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>;
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_dealloc_param_0];
+; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r2, [test_tcgen05_dealloc_param_1];
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2;
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2;
+; CHECK_PTX64_SHARED32-NEXT: ret;
+ call void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %tmem_addr, i32 %ncols)
+
+ call void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %tmem_addr, i32 %ncols)
+ ret void
+}
+
+declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1()
+declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2()
+
+; CHECK-LABEL: test_tcgen05_relinquish_alloc_permit
+define void @test_tcgen05_relinquish_alloc_permit() {
+; CHECK_PTX64-LABEL: test_tcgen05_relinquish_alloc_permit(
+; CHECK_PTX64: {
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;
+; CHECK_PTX64-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_relinquish_alloc_permit(
+; CHECK_PTX64_SHARED32: {
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-EMPTY:
+; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;
+; CHECK_PTX64_SHARED32-NEXT: tcgen05.relinquish_alloc_permit.cta_group::2.sync.aligned;
+; CHECK_PTX64_SHARED32-NEXT: ret;
+ call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1()
+
+ call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2()
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/ipra.ll b/llvm/test/CodeGen/RISCV/ipra.ll
index 717b778..0bcded9 100644
--- a/llvm/test/CodeGen/RISCV/ipra.ll
+++ b/llvm/test/CodeGen/RISCV/ipra.ll
@@ -63,26 +63,29 @@ entry:
ret i32 0
}
-; FIXME: this function calls another function but doesn't save/restore ra
define internal void @foobar(ptr %live_throughout.0.val) norecurse nounwind {
; RV64-LABEL: foobar:
; RV64: # %bb.0: # %entry
; RV64-NEXT: addi sp, sp, -48
+; RV64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
; RV64-NEXT: mv a1, a0
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: addi a2, sp, 12
+; RV64-NEXT: addi a0, sp, 8
+; RV64-NEXT: addi a2, sp, 4
; RV64-NEXT: call bmp_iter_set_init
+; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 48
; RV64-NEXT: ret
;
; RV32-LABEL: foobar:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
; RV32-NEXT: mv a1, a0
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: addi a2, sp, 4
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a2, sp, 12
; RV32-NEXT: call bmp_iter_set_init
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-WITHFP-LABEL: foobar:
diff --git a/llvm/test/CodeGen/RISCV/rda-stack.mir b/llvm/test/CodeGen/RISCV/rda-stack.mir
index 5f49741..b3111e66 100644
--- a/llvm/test/CodeGen/RISCV/rda-stack.mir
+++ b/llvm/test/CodeGen/RISCV/rda-stack.mir
@@ -149,3 +149,43 @@ body: |
$x10 = LD %stack.0, 0 :: (load (s64))
PseudoRET implicit $x10
...
+---
+name: test4
+tracksRegLiveness: true
+fixedStack:
+ - { id: 0, type: default, offset: 0, size: 4, alignment: 16,
+ isImmutable: true, isAliased: false }
+stack:
+ - { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: default, offset: 0, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body: |
+ ; CHECK: RDA results for test4
+ ; CHECK-NEXT: $x10:{ }
+ ; CHECK-NEXT: %stack.0:{ }
+ ; CHECK-NEXT: 0: SD $x10, %stack.0, 0 :: (store (s64))
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: $x11:{ }
+ ; CHECK-NEXT: %stack.0:{ 0 }
+ ; CHECK-NEXT: 1: SD $x11, %stack.0, 0 :: (store (s64))
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: $x10:{ }
+ ; CHECK-NEXT: %stack.1:{ }
+ ; CHECK-NEXT: 2: SD $x10, %stack.1, 0 :: (store (s64))
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: $x11:{ }
+ ; CHECK-NEXT: %stack.1:{ 2 }
+ ; CHECK-NEXT: 3: SD $x11, %stack.1, 0 :: (store (s64))
+ ; CHECK-EMPTY:
+ ; CHECK-NEXT: 4: PseudoRET
+ bb.0.entry:
+ liveins: $x10, $x11
+ SD $x10, %stack.0, 0 :: (store (s64))
+ SD $x11, %stack.0, 0 :: (store (s64))
+ SD $x10, %stack.1, 0 :: (store (s64))
+ SD $x11, %stack.1, 0 :: (store (s64))
+ PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index c6e12c5..232a364 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -2730,21 +2730,21 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
; RV32-LABEL: mgather_baseidx_zext_v8i8_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v9, v8
-; RV32-NEXT: vsll.vi v8, v9, 2
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v9, v8, a1
; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV32-NEXT: vluxei16.v v10, (a0), v8, v0.t
+; RV32-NEXT: vluxei16.v v10, (a0), v9, v0.t
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
;
; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8i32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64V-NEXT: vzext.vf2 v9, v8
-; RV64V-NEXT: vsll.vi v8, v9, 2
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64V-NEXT: vwmulu.vx v9, v8, a1
; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV64V-NEXT: vluxei16.v v10, (a0), v8, v0.t
+; RV64V-NEXT: vluxei16.v v10, (a0), v9, v0.t
; RV64V-NEXT: vmv.v.v v8, v10
; RV64V-NEXT: ret
;
@@ -2888,10 +2888,11 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
; RV32-LABEL: mgather_baseidx_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
;
@@ -3037,10 +3038,11 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i
define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
;
@@ -3187,19 +3189,21 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i32> %passthru) {
; RV32-LABEL: mgather_baseidx_zext_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vzext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
;
; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8i32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV64V-NEXT: vzext.vf2 v12, v8
-; RV64V-NEXT: vsll.vi v8, v12, 2
-; RV64V-NEXT: vluxei32.v v10, (a0), v8, v0.t
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64V-NEXT: vwmulu.vx v12, v8, a1
+; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64V-NEXT: vluxei32.v v10, (a0), v12, v0.t
; RV64V-NEXT: vmv.v.v v8, v10
; RV64V-NEXT: ret
;
@@ -3352,10 +3356,9 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m,
;
; RV64V-LABEL: mgather_baseidx_v8i32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64V-NEXT: vsext.vf2 v12, v8
-; RV64V-NEXT: vsll.vi v12, v12, 2
-; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64V-NEXT: vwmulsu.vx v12, v8, a1
; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t
; RV64V-NEXT: vmv.v.v v8, v10
; RV64V-NEXT: ret
@@ -4636,21 +4639,21 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
; RV32V-LABEL: mgather_baseidx_zext_v8i8_v8i64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32V-NEXT: vzext.vf2 v9, v8
-; RV32V-NEXT: vsll.vi v8, v9, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32V-NEXT: vwmulu.vx v9, v8, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32V-NEXT: vluxei16.v v12, (a0), v8, v0.t
+; RV32V-NEXT: vluxei16.v v12, (a0), v9, v0.t
; RV32V-NEXT: vmv.v.v v8, v12
; RV32V-NEXT: ret
;
; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8i64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64V-NEXT: vzext.vf2 v9, v8
-; RV64V-NEXT: vsll.vi v8, v9, 3
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64V-NEXT: vwmulu.vx v9, v8, a1
; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV64V-NEXT: vluxei16.v v12, (a0), v8, v0.t
+; RV64V-NEXT: vluxei16.v v12, (a0), v9, v0.t
; RV64V-NEXT: vmv.v.v v8, v12
; RV64V-NEXT: ret
;
@@ -4923,11 +4926,11 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
; RV32V-LABEL: mgather_baseidx_v8i16_v8i64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vsext.vf2 v10, v8
-; RV32V-NEXT: vsll.vi v8, v10, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulsu.vx v10, v8, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t
; RV32V-NEXT: vmv.v.v v8, v12
; RV32V-NEXT: ret
;
@@ -4943,16 +4946,17 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
; RV32ZVE32F-LABEL: mgather_baseidx_v8i16_v8i64:
; RV32ZVE32F: # %bb.0:
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsext.vf2 v10, v8
+; RV32ZVE32F-NEXT: vmv.v.x v10, a1
+; RV32ZVE32F-NEXT: li a1, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; RV32ZVE32F-NEXT: vmv.x.s t0, v0
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
; RV32ZVE32F-NEXT: andi a3, t0, 1
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8
; RV32ZVE32F-NEXT: beqz a3, .LBB51_7
; RV32ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV32ZVE32F-NEXT: vmv.x.s a3, v8
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s a3, v10
; RV32ZVE32F-NEXT: lw a1, 0(a3)
; RV32ZVE32F-NEXT: lw a3, 4(a3)
; RV32ZVE32F-NEXT: andi a4, t0, 2
@@ -4988,40 +4992,40 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
; RV32ZVE32F-NEXT: beqz a4, .LBB51_2
; RV32ZVE32F-NEXT: .LBB51_8: # %cond.load1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a5, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a5, v8
; RV32ZVE32F-NEXT: lw a4, 0(a5)
; RV32ZVE32F-NEXT: lw a5, 4(a5)
; RV32ZVE32F-NEXT: andi a6, t0, 4
; RV32ZVE32F-NEXT: beqz a6, .LBB51_3
; RV32ZVE32F-NEXT: .LBB51_9: # %cond.load4
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a7, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a7, v8
; RV32ZVE32F-NEXT: lw a6, 0(a7)
; RV32ZVE32F-NEXT: lw a7, 4(a7)
; RV32ZVE32F-NEXT: andi t1, t0, 8
; RV32ZVE32F-NEXT: beqz t1, .LBB51_4
; RV32ZVE32F-NEXT: .LBB51_10: # %cond.load7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s t2, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s t2, v8
; RV32ZVE32F-NEXT: lw t1, 0(t2)
; RV32ZVE32F-NEXT: lw t2, 4(t2)
; RV32ZVE32F-NEXT: andi t3, t0, 16
; RV32ZVE32F-NEXT: beqz t3, .LBB51_5
; RV32ZVE32F-NEXT: .LBB51_11: # %cond.load10
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s t4, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s t4, v8
; RV32ZVE32F-NEXT: lw t3, 0(t4)
; RV32ZVE32F-NEXT: lw t4, 4(t4)
; RV32ZVE32F-NEXT: andi t5, t0, 32
; RV32ZVE32F-NEXT: beqz t5, .LBB51_6
; RV32ZVE32F-NEXT: .LBB51_12: # %cond.load13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s t6, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s t6, v8
; RV32ZVE32F-NEXT: lw t5, 0(t6)
; RV32ZVE32F-NEXT: lw t6, 4(t6)
; RV32ZVE32F-NEXT: .LBB51_13: # %else14
@@ -5035,8 +5039,8 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
; RV32ZVE32F-NEXT: beqz s0, .LBB51_16
; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s s1, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s s1, v8
; RV32ZVE32F-NEXT: lw s0, 0(s1)
; RV32ZVE32F-NEXT: lw s1, 4(s1)
; RV32ZVE32F-NEXT: andi t0, t0, -128
@@ -5052,7 +5056,7 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
; RV32ZVE32F-NEXT: beqz t0, .LBB51_15
; RV32ZVE32F-NEXT: .LBB51_17: # %cond.load19
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a2, v8
; RV32ZVE32F-NEXT: lw t0, 0(a2)
; RV32ZVE32F-NEXT: lw a2, 4(a2)
@@ -5201,11 +5205,11 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
; RV32V-LABEL: mgather_baseidx_sext_v8i16_v8i64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vsext.vf2 v10, v8
-; RV32V-NEXT: vsll.vi v8, v10, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulsu.vx v10, v8, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t
; RV32V-NEXT: vmv.v.v v8, v12
; RV32V-NEXT: ret
;
@@ -5221,16 +5225,17 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
; RV32ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8i64:
; RV32ZVE32F: # %bb.0:
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsext.vf2 v10, v8
+; RV32ZVE32F-NEXT: vmv.v.x v10, a1
+; RV32ZVE32F-NEXT: li a1, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; RV32ZVE32F-NEXT: vmv.x.s t0, v0
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
; RV32ZVE32F-NEXT: andi a3, t0, 1
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8
; RV32ZVE32F-NEXT: beqz a3, .LBB52_7
; RV32ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV32ZVE32F-NEXT: vmv.x.s a3, v8
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s a3, v10
; RV32ZVE32F-NEXT: lw a1, 0(a3)
; RV32ZVE32F-NEXT: lw a3, 4(a3)
; RV32ZVE32F-NEXT: andi a4, t0, 2
@@ -5266,40 +5271,40 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
; RV32ZVE32F-NEXT: beqz a4, .LBB52_2
; RV32ZVE32F-NEXT: .LBB52_8: # %cond.load1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a5, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a5, v8
; RV32ZVE32F-NEXT: lw a4, 0(a5)
; RV32ZVE32F-NEXT: lw a5, 4(a5)
; RV32ZVE32F-NEXT: andi a6, t0, 4
; RV32ZVE32F-NEXT: beqz a6, .LBB52_3
; RV32ZVE32F-NEXT: .LBB52_9: # %cond.load4
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a7, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a7, v8
; RV32ZVE32F-NEXT: lw a6, 0(a7)
; RV32ZVE32F-NEXT: lw a7, 4(a7)
; RV32ZVE32F-NEXT: andi t1, t0, 8
; RV32ZVE32F-NEXT: beqz t1, .LBB52_4
; RV32ZVE32F-NEXT: .LBB52_10: # %cond.load7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s t2, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s t2, v8
; RV32ZVE32F-NEXT: lw t1, 0(t2)
; RV32ZVE32F-NEXT: lw t2, 4(t2)
; RV32ZVE32F-NEXT: andi t3, t0, 16
; RV32ZVE32F-NEXT: beqz t3, .LBB52_5
; RV32ZVE32F-NEXT: .LBB52_11: # %cond.load10
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s t4, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s t4, v8
; RV32ZVE32F-NEXT: lw t3, 0(t4)
; RV32ZVE32F-NEXT: lw t4, 4(t4)
; RV32ZVE32F-NEXT: andi t5, t0, 32
; RV32ZVE32F-NEXT: beqz t5, .LBB52_6
; RV32ZVE32F-NEXT: .LBB52_12: # %cond.load13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s t6, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s t6, v8
; RV32ZVE32F-NEXT: lw t5, 0(t6)
; RV32ZVE32F-NEXT: lw t6, 4(t6)
; RV32ZVE32F-NEXT: .LBB52_13: # %else14
@@ -5313,8 +5318,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
; RV32ZVE32F-NEXT: beqz s0, .LBB52_16
; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s s1, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s s1, v8
; RV32ZVE32F-NEXT: lw s0, 0(s1)
; RV32ZVE32F-NEXT: lw s1, 4(s1)
; RV32ZVE32F-NEXT: andi t0, t0, -128
@@ -5330,7 +5335,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
; RV32ZVE32F-NEXT: beqz t0, .LBB52_15
; RV32ZVE32F-NEXT: .LBB52_17: # %cond.load19
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a2, v8
; RV32ZVE32F-NEXT: lw t0, 0(a2)
; RV32ZVE32F-NEXT: lw a2, 4(a2)
@@ -5480,37 +5485,38 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) {
; RV32V-LABEL: mgather_baseidx_zext_v8i16_v8i64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vzext.vf2 v10, v8
-; RV32V-NEXT: vsll.vi v8, v10, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulu.vx v10, v8, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t
; RV32V-NEXT: vmv.v.v v8, v12
; RV32V-NEXT: ret
;
; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8i64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64V-NEXT: vzext.vf2 v10, v8
-; RV64V-NEXT: vsll.vi v8, v10, 3
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64V-NEXT: vwmulu.vx v10, v8, a1
; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV64V-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV64V-NEXT: vluxei32.v v12, (a0), v10, v0.t
; RV64V-NEXT: vmv.v.v v8, v12
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8i64:
; RV32ZVE32F: # %bb.0:
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vzext.vf2 v10, v8
+; RV32ZVE32F-NEXT: vmv.v.x v10, a1
+; RV32ZVE32F-NEXT: li a1, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
; RV32ZVE32F-NEXT: vmv.x.s t0, v0
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
; RV32ZVE32F-NEXT: andi a3, t0, 1
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccu.vx v10, a1, v8
; RV32ZVE32F-NEXT: beqz a3, .LBB53_7
; RV32ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV32ZVE32F-NEXT: vmv.x.s a3, v8
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s a3, v10
; RV32ZVE32F-NEXT: lw a1, 0(a3)
; RV32ZVE32F-NEXT: lw a3, 4(a3)
; RV32ZVE32F-NEXT: andi a4, t0, 2
@@ -5546,40 +5552,40 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
; RV32ZVE32F-NEXT: beqz a4, .LBB53_2
; RV32ZVE32F-NEXT: .LBB53_8: # %cond.load1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a5, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a5, v8
; RV32ZVE32F-NEXT: lw a4, 0(a5)
; RV32ZVE32F-NEXT: lw a5, 4(a5)
; RV32ZVE32F-NEXT: andi a6, t0, 4
; RV32ZVE32F-NEXT: beqz a6, .LBB53_3
; RV32ZVE32F-NEXT: .LBB53_9: # %cond.load4
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a7, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a7, v8
; RV32ZVE32F-NEXT: lw a6, 0(a7)
; RV32ZVE32F-NEXT: lw a7, 4(a7)
; RV32ZVE32F-NEXT: andi t1, t0, 8
; RV32ZVE32F-NEXT: beqz t1, .LBB53_4
; RV32ZVE32F-NEXT: .LBB53_10: # %cond.load7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s t2, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s t2, v8
; RV32ZVE32F-NEXT: lw t1, 0(t2)
; RV32ZVE32F-NEXT: lw t2, 4(t2)
; RV32ZVE32F-NEXT: andi t3, t0, 16
; RV32ZVE32F-NEXT: beqz t3, .LBB53_5
; RV32ZVE32F-NEXT: .LBB53_11: # %cond.load10
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s t4, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s t4, v8
; RV32ZVE32F-NEXT: lw t3, 0(t4)
; RV32ZVE32F-NEXT: lw t4, 4(t4)
; RV32ZVE32F-NEXT: andi t5, t0, 32
; RV32ZVE32F-NEXT: beqz t5, .LBB53_6
; RV32ZVE32F-NEXT: .LBB53_12: # %cond.load13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s t6, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s t6, v8
; RV32ZVE32F-NEXT: lw t5, 0(t6)
; RV32ZVE32F-NEXT: lw t6, 4(t6)
; RV32ZVE32F-NEXT: .LBB53_13: # %else14
@@ -5593,8 +5599,8 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
; RV32ZVE32F-NEXT: beqz s0, .LBB53_16
; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s s1, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s s1, v8
; RV32ZVE32F-NEXT: lw s0, 0(s1)
; RV32ZVE32F-NEXT: lw s1, 4(s1)
; RV32ZVE32F-NEXT: andi t0, t0, -128
@@ -5610,7 +5616,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
; RV32ZVE32F-NEXT: beqz t0, .LBB53_15
; RV32ZVE32F-NEXT: .LBB53_17: # %cond.load19
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a2, v8
; RV32ZVE32F-NEXT: lw t0, 0(a2)
; RV32ZVE32F-NEXT: lw a2, 4(a2)
@@ -5777,10 +5783,11 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i
;
; RV64V-LABEL: mgather_baseidx_v8i32_v8i64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64V-NEXT: vsext.vf2 v16, v8
-; RV64V-NEXT: vsll.vi v8, v16, 3
-; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulsu.vx v16, v8, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64V-NEXT: vmv.v.v v8, v12
; RV64V-NEXT: ret
;
@@ -6053,10 +6060,11 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
;
; RV64V-LABEL: mgather_baseidx_sext_v8i32_v8i64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64V-NEXT: vsext.vf2 v16, v8
-; RV64V-NEXT: vsll.vi v8, v16, 3
-; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulsu.vx v16, v8, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64V-NEXT: vmv.v.v v8, v12
; RV64V-NEXT: ret
;
@@ -6330,10 +6338,11 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <
;
; RV64V-LABEL: mgather_baseidx_zext_v8i32_v8i64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64V-NEXT: vzext.vf2 v16, v8
-; RV64V-NEXT: vsll.vi v8, v16, 3
-; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulu.vx v16, v8, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64V-NEXT: vmv.v.v v8, v12
; RV64V-NEXT: ret
;
@@ -10032,21 +10041,21 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x float> %passthru) {
; RV32-LABEL: mgather_baseidx_zext_v8i8_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v9, v8
-; RV32-NEXT: vsll.vi v8, v9, 2
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v9, v8, a1
; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV32-NEXT: vluxei16.v v10, (a0), v8, v0.t
+; RV32-NEXT: vluxei16.v v10, (a0), v9, v0.t
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
;
; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8f32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64V-NEXT: vzext.vf2 v9, v8
-; RV64V-NEXT: vsll.vi v8, v9, 2
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64V-NEXT: vwmulu.vx v9, v8, a1
; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV64V-NEXT: vluxei16.v v10, (a0), v8, v0.t
+; RV64V-NEXT: vluxei16.v v10, (a0), v9, v0.t
; RV64V-NEXT: vmv.v.v v8, v10
; RV64V-NEXT: ret
;
@@ -10190,10 +10199,11 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x float> %passthru) {
; RV32-LABEL: mgather_baseidx_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
;
@@ -10339,10 +10349,11 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x
define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x float> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
;
@@ -10489,19 +10500,21 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x float> %passthru) {
; RV32-LABEL: mgather_baseidx_zext_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vzext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
;
; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8f32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV64V-NEXT: vzext.vf2 v12, v8
-; RV64V-NEXT: vsll.vi v8, v12, 2
-; RV64V-NEXT: vluxei32.v v10, (a0), v8, v0.t
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64V-NEXT: vwmulu.vx v12, v8, a1
+; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64V-NEXT: vluxei32.v v10, (a0), v12, v0.t
; RV64V-NEXT: vmv.v.v v8, v10
; RV64V-NEXT: ret
;
@@ -10654,10 +10667,9 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> %
;
; RV64V-LABEL: mgather_baseidx_v8f32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64V-NEXT: vsext.vf2 v12, v8
-; RV64V-NEXT: vsll.vi v12, v12, 2
-; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64V-NEXT: vwmulsu.vx v12, v8, a1
; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t
; RV64V-NEXT: vmv.v.v v8, v10
; RV64V-NEXT: ret
@@ -11702,21 +11714,21 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x double> %passthru) {
; RV32V-LABEL: mgather_baseidx_zext_v8i8_v8f64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32V-NEXT: vzext.vf2 v9, v8
-; RV32V-NEXT: vsll.vi v8, v9, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32V-NEXT: vwmulu.vx v9, v8, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32V-NEXT: vluxei16.v v12, (a0), v8, v0.t
+; RV32V-NEXT: vluxei16.v v12, (a0), v9, v0.t
; RV32V-NEXT: vmv.v.v v8, v12
; RV32V-NEXT: ret
;
; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8f64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64V-NEXT: vzext.vf2 v9, v8
-; RV64V-NEXT: vsll.vi v8, v9, 3
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64V-NEXT: vwmulu.vx v9, v8, a1
; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV64V-NEXT: vluxei16.v v12, (a0), v8, v0.t
+; RV64V-NEXT: vluxei16.v v12, (a0), v9, v0.t
; RV64V-NEXT: vmv.v.v v8, v12
; RV64V-NEXT: ret
;
@@ -11927,11 +11939,11 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) {
; RV32V-LABEL: mgather_baseidx_v8i16_v8f64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vsext.vf2 v10, v8
-; RV32V-NEXT: vsll.vi v8, v10, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulsu.vx v10, v8, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t
; RV32V-NEXT: vmv.v.v v8, v12
; RV32V-NEXT: ret
;
@@ -11947,38 +11959,38 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
; RV32ZVE32F-LABEL: mgather_baseidx_v8i16_v8f64:
; RV32ZVE32F: # %bb.0:
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsext.vf2 v10, v8
+; RV32ZVE32F-NEXT: vmv.v.x v10, a1
+; RV32ZVE32F-NEXT: li a2, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT: vmv.x.s a2, v0
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT: andi a3, a2, 1
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v0
+; RV32ZVE32F-NEXT: andi a3, a1, 1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccus.vx v10, a2, v8
; RV32ZVE32F-NEXT: bnez a3, .LBB100_10
; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi a1, a2, 2
-; RV32ZVE32F-NEXT: bnez a1, .LBB100_11
+; RV32ZVE32F-NEXT: andi a2, a1, 2
+; RV32ZVE32F-NEXT: bnez a2, .LBB100_11
; RV32ZVE32F-NEXT: .LBB100_2: # %else2
-; RV32ZVE32F-NEXT: andi a1, a2, 4
-; RV32ZVE32F-NEXT: bnez a1, .LBB100_12
+; RV32ZVE32F-NEXT: andi a2, a1, 4
+; RV32ZVE32F-NEXT: bnez a2, .LBB100_12
; RV32ZVE32F-NEXT: .LBB100_3: # %else5
-; RV32ZVE32F-NEXT: andi a1, a2, 8
-; RV32ZVE32F-NEXT: bnez a1, .LBB100_13
+; RV32ZVE32F-NEXT: andi a2, a1, 8
+; RV32ZVE32F-NEXT: bnez a2, .LBB100_13
; RV32ZVE32F-NEXT: .LBB100_4: # %else8
-; RV32ZVE32F-NEXT: andi a1, a2, 16
-; RV32ZVE32F-NEXT: bnez a1, .LBB100_14
+; RV32ZVE32F-NEXT: andi a2, a1, 16
+; RV32ZVE32F-NEXT: bnez a2, .LBB100_14
; RV32ZVE32F-NEXT: .LBB100_5: # %else11
-; RV32ZVE32F-NEXT: andi a1, a2, 32
-; RV32ZVE32F-NEXT: bnez a1, .LBB100_15
+; RV32ZVE32F-NEXT: andi a2, a1, 32
+; RV32ZVE32F-NEXT: bnez a2, .LBB100_15
; RV32ZVE32F-NEXT: .LBB100_6: # %else14
-; RV32ZVE32F-NEXT: andi a1, a2, 64
-; RV32ZVE32F-NEXT: bnez a1, .LBB100_16
+; RV32ZVE32F-NEXT: andi a2, a1, 64
+; RV32ZVE32F-NEXT: bnez a2, .LBB100_16
; RV32ZVE32F-NEXT: .LBB100_7: # %else17
-; RV32ZVE32F-NEXT: andi a1, a2, -128
+; RV32ZVE32F-NEXT: andi a1, a1, -128
; RV32ZVE32F-NEXT: beqz a1, .LBB100_9
; RV32ZVE32F-NEXT: .LBB100_8: # %cond.load19
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a1, v8
; RV32ZVE32F-NEXT: fld fa7, 0(a1)
; RV32ZVE32F-NEXT: .LBB100_9: # %else20
@@ -11992,51 +12004,52 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
; RV32ZVE32F-NEXT: fsd fa7, 56(a0)
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB100_10: # %cond.load
-; RV32ZVE32F-NEXT: vmv.x.s a1, v8
-; RV32ZVE32F-NEXT: fld fa0, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 2
-; RV32ZVE32F-NEXT: beqz a1, .LBB100_2
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s a2, v10
+; RV32ZVE32F-NEXT: fld fa0, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 2
+; RV32ZVE32F-NEXT: beqz a2, .LBB100_2
; RV32ZVE32F-NEXT: .LBB100_11: # %cond.load1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa1, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 4
-; RV32ZVE32F-NEXT: beqz a1, .LBB100_3
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa1, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 4
+; RV32ZVE32F-NEXT: beqz a2, .LBB100_3
; RV32ZVE32F-NEXT: .LBB100_12: # %cond.load4
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa2, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 8
-; RV32ZVE32F-NEXT: beqz a1, .LBB100_4
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa2, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 8
+; RV32ZVE32F-NEXT: beqz a2, .LBB100_4
; RV32ZVE32F-NEXT: .LBB100_13: # %cond.load7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa3, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 16
-; RV32ZVE32F-NEXT: beqz a1, .LBB100_5
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa3, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 16
+; RV32ZVE32F-NEXT: beqz a2, .LBB100_5
; RV32ZVE32F-NEXT: .LBB100_14: # %cond.load10
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa4, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 32
-; RV32ZVE32F-NEXT: beqz a1, .LBB100_6
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa4, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 32
+; RV32ZVE32F-NEXT: beqz a2, .LBB100_6
; RV32ZVE32F-NEXT: .LBB100_15: # %cond.load13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa5, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 64
-; RV32ZVE32F-NEXT: beqz a1, .LBB100_7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa5, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 64
+; RV32ZVE32F-NEXT: beqz a2, .LBB100_7
; RV32ZVE32F-NEXT: .LBB100_16: # %cond.load16
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa6, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, -128
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa6, 0(a2)
+; RV32ZVE32F-NEXT: andi a1, a1, -128
; RV32ZVE32F-NEXT: bnez a1, .LBB100_8
; RV32ZVE32F-NEXT: j .LBB100_9
;
@@ -12143,11 +12156,11 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) {
; RV32V-LABEL: mgather_baseidx_sext_v8i16_v8f64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vsext.vf2 v10, v8
-; RV32V-NEXT: vsll.vi v8, v10, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulsu.vx v10, v8, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t
; RV32V-NEXT: vmv.v.v v8, v12
; RV32V-NEXT: ret
;
@@ -12163,38 +12176,38 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
; RV32ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8f64:
; RV32ZVE32F: # %bb.0:
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsext.vf2 v10, v8
+; RV32ZVE32F-NEXT: vmv.v.x v10, a1
+; RV32ZVE32F-NEXT: li a2, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT: vmv.x.s a2, v0
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT: andi a3, a2, 1
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v0
+; RV32ZVE32F-NEXT: andi a3, a1, 1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccus.vx v10, a2, v8
; RV32ZVE32F-NEXT: bnez a3, .LBB101_10
; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi a1, a2, 2
-; RV32ZVE32F-NEXT: bnez a1, .LBB101_11
+; RV32ZVE32F-NEXT: andi a2, a1, 2
+; RV32ZVE32F-NEXT: bnez a2, .LBB101_11
; RV32ZVE32F-NEXT: .LBB101_2: # %else2
-; RV32ZVE32F-NEXT: andi a1, a2, 4
-; RV32ZVE32F-NEXT: bnez a1, .LBB101_12
+; RV32ZVE32F-NEXT: andi a2, a1, 4
+; RV32ZVE32F-NEXT: bnez a2, .LBB101_12
; RV32ZVE32F-NEXT: .LBB101_3: # %else5
-; RV32ZVE32F-NEXT: andi a1, a2, 8
-; RV32ZVE32F-NEXT: bnez a1, .LBB101_13
+; RV32ZVE32F-NEXT: andi a2, a1, 8
+; RV32ZVE32F-NEXT: bnez a2, .LBB101_13
; RV32ZVE32F-NEXT: .LBB101_4: # %else8
-; RV32ZVE32F-NEXT: andi a1, a2, 16
-; RV32ZVE32F-NEXT: bnez a1, .LBB101_14
+; RV32ZVE32F-NEXT: andi a2, a1, 16
+; RV32ZVE32F-NEXT: bnez a2, .LBB101_14
; RV32ZVE32F-NEXT: .LBB101_5: # %else11
-; RV32ZVE32F-NEXT: andi a1, a2, 32
-; RV32ZVE32F-NEXT: bnez a1, .LBB101_15
+; RV32ZVE32F-NEXT: andi a2, a1, 32
+; RV32ZVE32F-NEXT: bnez a2, .LBB101_15
; RV32ZVE32F-NEXT: .LBB101_6: # %else14
-; RV32ZVE32F-NEXT: andi a1, a2, 64
-; RV32ZVE32F-NEXT: bnez a1, .LBB101_16
+; RV32ZVE32F-NEXT: andi a2, a1, 64
+; RV32ZVE32F-NEXT: bnez a2, .LBB101_16
; RV32ZVE32F-NEXT: .LBB101_7: # %else17
-; RV32ZVE32F-NEXT: andi a1, a2, -128
+; RV32ZVE32F-NEXT: andi a1, a1, -128
; RV32ZVE32F-NEXT: beqz a1, .LBB101_9
; RV32ZVE32F-NEXT: .LBB101_8: # %cond.load19
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a1, v8
; RV32ZVE32F-NEXT: fld fa7, 0(a1)
; RV32ZVE32F-NEXT: .LBB101_9: # %else20
@@ -12208,51 +12221,52 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
; RV32ZVE32F-NEXT: fsd fa7, 56(a0)
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB101_10: # %cond.load
-; RV32ZVE32F-NEXT: vmv.x.s a1, v8
-; RV32ZVE32F-NEXT: fld fa0, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 2
-; RV32ZVE32F-NEXT: beqz a1, .LBB101_2
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s a2, v10
+; RV32ZVE32F-NEXT: fld fa0, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 2
+; RV32ZVE32F-NEXT: beqz a2, .LBB101_2
; RV32ZVE32F-NEXT: .LBB101_11: # %cond.load1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa1, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 4
-; RV32ZVE32F-NEXT: beqz a1, .LBB101_3
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa1, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 4
+; RV32ZVE32F-NEXT: beqz a2, .LBB101_3
; RV32ZVE32F-NEXT: .LBB101_12: # %cond.load4
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa2, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 8
-; RV32ZVE32F-NEXT: beqz a1, .LBB101_4
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa2, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 8
+; RV32ZVE32F-NEXT: beqz a2, .LBB101_4
; RV32ZVE32F-NEXT: .LBB101_13: # %cond.load7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa3, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 16
-; RV32ZVE32F-NEXT: beqz a1, .LBB101_5
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa3, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 16
+; RV32ZVE32F-NEXT: beqz a2, .LBB101_5
; RV32ZVE32F-NEXT: .LBB101_14: # %cond.load10
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa4, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 32
-; RV32ZVE32F-NEXT: beqz a1, .LBB101_6
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa4, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 32
+; RV32ZVE32F-NEXT: beqz a2, .LBB101_6
; RV32ZVE32F-NEXT: .LBB101_15: # %cond.load13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa5, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 64
-; RV32ZVE32F-NEXT: beqz a1, .LBB101_7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa5, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 64
+; RV32ZVE32F-NEXT: beqz a2, .LBB101_7
; RV32ZVE32F-NEXT: .LBB101_16: # %cond.load16
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa6, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, -128
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa6, 0(a2)
+; RV32ZVE32F-NEXT: andi a1, a1, -128
; RV32ZVE32F-NEXT: bnez a1, .LBB101_8
; RV32ZVE32F-NEXT: j .LBB101_9
;
@@ -12360,59 +12374,59 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) {
; RV32V-LABEL: mgather_baseidx_zext_v8i16_v8f64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vzext.vf2 v10, v8
-; RV32V-NEXT: vsll.vi v8, v10, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulu.vx v10, v8, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t
; RV32V-NEXT: vmv.v.v v8, v12
; RV32V-NEXT: ret
;
; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8f64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64V-NEXT: vzext.vf2 v10, v8
-; RV64V-NEXT: vsll.vi v8, v10, 3
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64V-NEXT: vwmulu.vx v10, v8, a1
; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV64V-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV64V-NEXT: vluxei32.v v12, (a0), v10, v0.t
; RV64V-NEXT: vmv.v.v v8, v12
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8f64:
; RV32ZVE32F: # %bb.0:
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vzext.vf2 v10, v8
+; RV32ZVE32F-NEXT: vmv.v.x v10, a1
+; RV32ZVE32F-NEXT: li a2, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT: vmv.x.s a2, v0
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT: andi a3, a2, 1
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v0
+; RV32ZVE32F-NEXT: andi a3, a1, 1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccu.vx v10, a2, v8
; RV32ZVE32F-NEXT: bnez a3, .LBB102_10
; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi a1, a2, 2
-; RV32ZVE32F-NEXT: bnez a1, .LBB102_11
+; RV32ZVE32F-NEXT: andi a2, a1, 2
+; RV32ZVE32F-NEXT: bnez a2, .LBB102_11
; RV32ZVE32F-NEXT: .LBB102_2: # %else2
-; RV32ZVE32F-NEXT: andi a1, a2, 4
-; RV32ZVE32F-NEXT: bnez a1, .LBB102_12
+; RV32ZVE32F-NEXT: andi a2, a1, 4
+; RV32ZVE32F-NEXT: bnez a2, .LBB102_12
; RV32ZVE32F-NEXT: .LBB102_3: # %else5
-; RV32ZVE32F-NEXT: andi a1, a2, 8
-; RV32ZVE32F-NEXT: bnez a1, .LBB102_13
+; RV32ZVE32F-NEXT: andi a2, a1, 8
+; RV32ZVE32F-NEXT: bnez a2, .LBB102_13
; RV32ZVE32F-NEXT: .LBB102_4: # %else8
-; RV32ZVE32F-NEXT: andi a1, a2, 16
-; RV32ZVE32F-NEXT: bnez a1, .LBB102_14
+; RV32ZVE32F-NEXT: andi a2, a1, 16
+; RV32ZVE32F-NEXT: bnez a2, .LBB102_14
; RV32ZVE32F-NEXT: .LBB102_5: # %else11
-; RV32ZVE32F-NEXT: andi a1, a2, 32
-; RV32ZVE32F-NEXT: bnez a1, .LBB102_15
+; RV32ZVE32F-NEXT: andi a2, a1, 32
+; RV32ZVE32F-NEXT: bnez a2, .LBB102_15
; RV32ZVE32F-NEXT: .LBB102_6: # %else14
-; RV32ZVE32F-NEXT: andi a1, a2, 64
-; RV32ZVE32F-NEXT: bnez a1, .LBB102_16
+; RV32ZVE32F-NEXT: andi a2, a1, 64
+; RV32ZVE32F-NEXT: bnez a2, .LBB102_16
; RV32ZVE32F-NEXT: .LBB102_7: # %else17
-; RV32ZVE32F-NEXT: andi a1, a2, -128
+; RV32ZVE32F-NEXT: andi a1, a1, -128
; RV32ZVE32F-NEXT: beqz a1, .LBB102_9
; RV32ZVE32F-NEXT: .LBB102_8: # %cond.load19
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a1, v8
; RV32ZVE32F-NEXT: fld fa7, 0(a1)
; RV32ZVE32F-NEXT: .LBB102_9: # %else20
@@ -12426,51 +12440,52 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
; RV32ZVE32F-NEXT: fsd fa7, 56(a0)
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB102_10: # %cond.load
-; RV32ZVE32F-NEXT: vmv.x.s a1, v8
-; RV32ZVE32F-NEXT: fld fa0, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 2
-; RV32ZVE32F-NEXT: beqz a1, .LBB102_2
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s a2, v10
+; RV32ZVE32F-NEXT: fld fa0, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 2
+; RV32ZVE32F-NEXT: beqz a2, .LBB102_2
; RV32ZVE32F-NEXT: .LBB102_11: # %cond.load1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa1, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 4
-; RV32ZVE32F-NEXT: beqz a1, .LBB102_3
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa1, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 4
+; RV32ZVE32F-NEXT: beqz a2, .LBB102_3
; RV32ZVE32F-NEXT: .LBB102_12: # %cond.load4
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa2, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 8
-; RV32ZVE32F-NEXT: beqz a1, .LBB102_4
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa2, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 8
+; RV32ZVE32F-NEXT: beqz a2, .LBB102_4
; RV32ZVE32F-NEXT: .LBB102_13: # %cond.load7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa3, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 16
-; RV32ZVE32F-NEXT: beqz a1, .LBB102_5
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa3, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 16
+; RV32ZVE32F-NEXT: beqz a2, .LBB102_5
; RV32ZVE32F-NEXT: .LBB102_14: # %cond.load10
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa4, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 32
-; RV32ZVE32F-NEXT: beqz a1, .LBB102_6
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa4, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 32
+; RV32ZVE32F-NEXT: beqz a2, .LBB102_6
; RV32ZVE32F-NEXT: .LBB102_15: # %cond.load13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa5, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, 64
-; RV32ZVE32F-NEXT: beqz a1, .LBB102_7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa5, 0(a2)
+; RV32ZVE32F-NEXT: andi a2, a1, 64
+; RV32ZVE32F-NEXT: beqz a2, .LBB102_7
; RV32ZVE32F-NEXT: .LBB102_16: # %cond.load16
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s a1, v10
-; RV32ZVE32F-NEXT: fld fa6, 0(a1)
-; RV32ZVE32F-NEXT: andi a1, a2, -128
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s a2, v8
+; RV32ZVE32F-NEXT: fld fa6, 0(a2)
+; RV32ZVE32F-NEXT: andi a1, a1, -128
; RV32ZVE32F-NEXT: bnez a1, .LBB102_8
; RV32ZVE32F-NEXT: j .LBB102_9
;
@@ -12595,10 +12610,11 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8
;
; RV64V-LABEL: mgather_baseidx_v8i32_v8f64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64V-NEXT: vsext.vf2 v16, v8
-; RV64V-NEXT: vsll.vi v8, v16, 3
-; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulsu.vx v16, v8, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64V-NEXT: vmv.v.v v8, v12
; RV64V-NEXT: ret
;
@@ -12809,10 +12825,11 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idxs
;
; RV64V-LABEL: mgather_baseidx_sext_v8i32_v8f64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64V-NEXT: vsext.vf2 v16, v8
-; RV64V-NEXT: vsll.vi v8, v16, 3
-; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulsu.vx v16, v8, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64V-NEXT: vmv.v.v v8, v12
; RV64V-NEXT: ret
;
@@ -13024,10 +13041,11 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idxs
;
; RV64V-LABEL: mgather_baseidx_zext_v8i32_v8f64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64V-NEXT: vzext.vf2 v16, v8
-; RV64V-NEXT: vsll.vi v8, v16, 3
-; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulu.vx v16, v8, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64V-NEXT: vmv.v.v v8, v12
; RV64V-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 7ec4726..7fe1406 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -2148,20 +2148,20 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v11, v10
-; RV32-NEXT: vsll.vi v10, v11, 2
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v11, v10, a1
; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v11, v0.t
; RV32-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8i32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64V-NEXT: vzext.vf2 v11, v10
-; RV64V-NEXT: vsll.vi v10, v11, 2
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64V-NEXT: vwmulu.vx v11, v10, a1
; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV64V-NEXT: vsoxei16.v v8, (a0), v10, v0.t
+; RV64V-NEXT: vsoxei16.v v8, (a0), v11, v0.t
; RV64V-NEXT: ret
;
; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i32:
@@ -2295,10 +2295,11 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v10, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_v8i16_v8i32:
@@ -2433,10 +2434,11 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %
define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v10, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_sext_v8i16_v8i32:
@@ -2572,18 +2574,20 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v10, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_zext_v8i16_v8i32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64V-NEXT: vzext.vf2 v12, v10
-; RV64V-NEXT: vsll.vi v10, v12, 2
-; RV64V-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64V-NEXT: vwmulu.vx v12, v10, a1
+; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64V-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV64V-NEXT: ret
;
; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i32:
@@ -2725,10 +2729,9 @@ define void @mscatter_baseidx_v8i32(<8 x i32> %val, ptr %base, <8 x i32> %idxs,
;
; RV64V-LABEL: mscatter_baseidx_v8i32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64V-NEXT: vsext.vf2 v12, v10
-; RV64V-NEXT: vsll.vi v12, v12, 2
-; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulsu.vx v12, v10, a1
; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t
; RV64V-NEXT: ret
;
@@ -3881,20 +3884,20 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) {
; RV32V-LABEL: mscatter_baseidx_zext_v8i8_v8i64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32V-NEXT: vzext.vf2 v13, v12
-; RV32V-NEXT: vsll.vi v12, v13, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32V-NEXT: vwmulu.vx v13, v12, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV32V-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV32V-NEXT: vsoxei16.v v8, (a0), v13, v0.t
; RV32V-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8i64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64V-NEXT: vzext.vf2 v13, v12
-; RV64V-NEXT: vsll.vi v12, v13, 3
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64V-NEXT: vwmulu.vx v13, v12, a1
; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV64V-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV64V-NEXT: vsoxei16.v v8, (a0), v13, v0.t
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i64:
@@ -4141,11 +4144,11 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32V-LABEL: mscatter_baseidx_v8i16_v8i64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vsext.vf2 v14, v12
-; RV32V-NEXT: vsll.vi v12, v14, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulsu.vx v14, v12, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32V-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_v8i16_v8i64:
@@ -4173,47 +4176,47 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
; RV32ZVE32F-NEXT: lw a7, 44(a0)
; RV32ZVE32F-NEXT: lw a4, 48(a0)
; RV32ZVE32F-NEXT: lw a5, 52(a0)
-; RV32ZVE32F-NEXT: lw t3, 24(a0)
-; RV32ZVE32F-NEXT: lw t4, 28(a0)
-; RV32ZVE32F-NEXT: lw t1, 32(a0)
-; RV32ZVE32F-NEXT: lw t2, 36(a0)
-; RV32ZVE32F-NEXT: lw s0, 8(a0)
-; RV32ZVE32F-NEXT: lw s1, 12(a0)
-; RV32ZVE32F-NEXT: lw t5, 16(a0)
-; RV32ZVE32F-NEXT: lw t6, 20(a0)
+; RV32ZVE32F-NEXT: lw t2, 24(a0)
+; RV32ZVE32F-NEXT: lw t3, 28(a0)
+; RV32ZVE32F-NEXT: lw t0, 32(a0)
+; RV32ZVE32F-NEXT: lw t1, 36(a0)
+; RV32ZVE32F-NEXT: lw t6, 8(a0)
+; RV32ZVE32F-NEXT: lw s0, 12(a0)
+; RV32ZVE32F-NEXT: lw t4, 16(a0)
+; RV32ZVE32F-NEXT: lw t5, 20(a0)
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.v.x v10, a1
+; RV32ZVE32F-NEXT: li s1, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT: vmv.x.s t0, v0
-; RV32ZVE32F-NEXT: andi s2, t0, 1
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v0
+; RV32ZVE32F-NEXT: andi s2, a1, 1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccus.vx v10, s1, v8
; RV32ZVE32F-NEXT: bnez s2, .LBB45_10
; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi a0, t0, 2
+; RV32ZVE32F-NEXT: andi a0, a1, 2
; RV32ZVE32F-NEXT: bnez a0, .LBB45_11
; RV32ZVE32F-NEXT: .LBB45_2: # %else2
-; RV32ZVE32F-NEXT: andi a0, t0, 4
+; RV32ZVE32F-NEXT: andi a0, a1, 4
; RV32ZVE32F-NEXT: bnez a0, .LBB45_12
; RV32ZVE32F-NEXT: .LBB45_3: # %else4
-; RV32ZVE32F-NEXT: andi a0, t0, 8
+; RV32ZVE32F-NEXT: andi a0, a1, 8
; RV32ZVE32F-NEXT: bnez a0, .LBB45_13
; RV32ZVE32F-NEXT: .LBB45_4: # %else6
-; RV32ZVE32F-NEXT: andi a0, t0, 16
+; RV32ZVE32F-NEXT: andi a0, a1, 16
; RV32ZVE32F-NEXT: bnez a0, .LBB45_14
; RV32ZVE32F-NEXT: .LBB45_5: # %else8
-; RV32ZVE32F-NEXT: andi a0, t0, 32
+; RV32ZVE32F-NEXT: andi a0, a1, 32
; RV32ZVE32F-NEXT: bnez a0, .LBB45_15
; RV32ZVE32F-NEXT: .LBB45_6: # %else10
-; RV32ZVE32F-NEXT: andi a0, t0, 64
+; RV32ZVE32F-NEXT: andi a0, a1, 64
; RV32ZVE32F-NEXT: bnez a0, .LBB45_16
; RV32ZVE32F-NEXT: .LBB45_7: # %else12
-; RV32ZVE32F-NEXT: andi a0, t0, -128
+; RV32ZVE32F-NEXT: andi a0, a1, -128
; RV32ZVE32F-NEXT: beqz a0, .LBB45_9
; RV32ZVE32F-NEXT: .LBB45_8: # %cond.store13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a2, 0(a0)
; RV32ZVE32F-NEXT: sw a3, 4(a0)
@@ -4229,60 +4232,61 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB45_10: # %cond.store
; RV32ZVE32F-NEXT: .cfi_restore_state
-; RV32ZVE32F-NEXT: lw a1, 0(a0)
+; RV32ZVE32F-NEXT: lw s1, 0(a0)
; RV32ZVE32F-NEXT: lw a0, 4(a0)
-; RV32ZVE32F-NEXT: vmv.x.s s2, v8
-; RV32ZVE32F-NEXT: sw a1, 0(s2)
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s s2, v10
+; RV32ZVE32F-NEXT: sw s1, 0(s2)
; RV32ZVE32F-NEXT: sw a0, 4(s2)
-; RV32ZVE32F-NEXT: andi a0, t0, 2
+; RV32ZVE32F-NEXT: andi a0, a1, 2
; RV32ZVE32F-NEXT: beqz a0, .LBB45_2
; RV32ZVE32F-NEXT: .LBB45_11: # %cond.store1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw s0, 0(a0)
-; RV32ZVE32F-NEXT: sw s1, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 4
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t6, 0(a0)
+; RV32ZVE32F-NEXT: sw s0, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 4
; RV32ZVE32F-NEXT: beqz a0, .LBB45_3
; RV32ZVE32F-NEXT: .LBB45_12: # %cond.store3
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw t5, 0(a0)
-; RV32ZVE32F-NEXT: sw t6, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 8
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t4, 0(a0)
+; RV32ZVE32F-NEXT: sw t5, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 8
; RV32ZVE32F-NEXT: beqz a0, .LBB45_4
; RV32ZVE32F-NEXT: .LBB45_13: # %cond.store5
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw t3, 0(a0)
-; RV32ZVE32F-NEXT: sw t4, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 16
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t2, 0(a0)
+; RV32ZVE32F-NEXT: sw t3, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 16
; RV32ZVE32F-NEXT: beqz a0, .LBB45_5
; RV32ZVE32F-NEXT: .LBB45_14: # %cond.store7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw t1, 0(a0)
-; RV32ZVE32F-NEXT: sw t2, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 32
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t0, 0(a0)
+; RV32ZVE32F-NEXT: sw t1, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 32
; RV32ZVE32F-NEXT: beqz a0, .LBB45_6
; RV32ZVE32F-NEXT: .LBB45_15: # %cond.store9
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a6, 0(a0)
; RV32ZVE32F-NEXT: sw a7, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 64
+; RV32ZVE32F-NEXT: andi a0, a1, 64
; RV32ZVE32F-NEXT: beqz a0, .LBB45_7
; RV32ZVE32F-NEXT: .LBB45_16: # %cond.store11
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a4, 0(a0)
; RV32ZVE32F-NEXT: sw a5, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, -128
+; RV32ZVE32F-NEXT: andi a0, a1, -128
; RV32ZVE32F-NEXT: bnez a0, .LBB45_8
; RV32ZVE32F-NEXT: j .LBB45_9
;
@@ -4392,11 +4396,11 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32V-LABEL: mscatter_baseidx_sext_v8i16_v8i64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vsext.vf2 v14, v12
-; RV32V-NEXT: vsll.vi v12, v14, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulsu.vx v14, v12, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32V-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_sext_v8i16_v8i64:
@@ -4424,47 +4428,47 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
; RV32ZVE32F-NEXT: lw a7, 44(a0)
; RV32ZVE32F-NEXT: lw a4, 48(a0)
; RV32ZVE32F-NEXT: lw a5, 52(a0)
-; RV32ZVE32F-NEXT: lw t3, 24(a0)
-; RV32ZVE32F-NEXT: lw t4, 28(a0)
-; RV32ZVE32F-NEXT: lw t1, 32(a0)
-; RV32ZVE32F-NEXT: lw t2, 36(a0)
-; RV32ZVE32F-NEXT: lw s0, 8(a0)
-; RV32ZVE32F-NEXT: lw s1, 12(a0)
-; RV32ZVE32F-NEXT: lw t5, 16(a0)
-; RV32ZVE32F-NEXT: lw t6, 20(a0)
+; RV32ZVE32F-NEXT: lw t2, 24(a0)
+; RV32ZVE32F-NEXT: lw t3, 28(a0)
+; RV32ZVE32F-NEXT: lw t0, 32(a0)
+; RV32ZVE32F-NEXT: lw t1, 36(a0)
+; RV32ZVE32F-NEXT: lw t6, 8(a0)
+; RV32ZVE32F-NEXT: lw s0, 12(a0)
+; RV32ZVE32F-NEXT: lw t4, 16(a0)
+; RV32ZVE32F-NEXT: lw t5, 20(a0)
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.v.x v10, a1
+; RV32ZVE32F-NEXT: li s1, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT: vmv.x.s t0, v0
-; RV32ZVE32F-NEXT: andi s2, t0, 1
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v0
+; RV32ZVE32F-NEXT: andi s2, a1, 1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccus.vx v10, s1, v8
; RV32ZVE32F-NEXT: bnez s2, .LBB46_10
; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi a0, t0, 2
+; RV32ZVE32F-NEXT: andi a0, a1, 2
; RV32ZVE32F-NEXT: bnez a0, .LBB46_11
; RV32ZVE32F-NEXT: .LBB46_2: # %else2
-; RV32ZVE32F-NEXT: andi a0, t0, 4
+; RV32ZVE32F-NEXT: andi a0, a1, 4
; RV32ZVE32F-NEXT: bnez a0, .LBB46_12
; RV32ZVE32F-NEXT: .LBB46_3: # %else4
-; RV32ZVE32F-NEXT: andi a0, t0, 8
+; RV32ZVE32F-NEXT: andi a0, a1, 8
; RV32ZVE32F-NEXT: bnez a0, .LBB46_13
; RV32ZVE32F-NEXT: .LBB46_4: # %else6
-; RV32ZVE32F-NEXT: andi a0, t0, 16
+; RV32ZVE32F-NEXT: andi a0, a1, 16
; RV32ZVE32F-NEXT: bnez a0, .LBB46_14
; RV32ZVE32F-NEXT: .LBB46_5: # %else8
-; RV32ZVE32F-NEXT: andi a0, t0, 32
+; RV32ZVE32F-NEXT: andi a0, a1, 32
; RV32ZVE32F-NEXT: bnez a0, .LBB46_15
; RV32ZVE32F-NEXT: .LBB46_6: # %else10
-; RV32ZVE32F-NEXT: andi a0, t0, 64
+; RV32ZVE32F-NEXT: andi a0, a1, 64
; RV32ZVE32F-NEXT: bnez a0, .LBB46_16
; RV32ZVE32F-NEXT: .LBB46_7: # %else12
-; RV32ZVE32F-NEXT: andi a0, t0, -128
+; RV32ZVE32F-NEXT: andi a0, a1, -128
; RV32ZVE32F-NEXT: beqz a0, .LBB46_9
; RV32ZVE32F-NEXT: .LBB46_8: # %cond.store13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a2, 0(a0)
; RV32ZVE32F-NEXT: sw a3, 4(a0)
@@ -4480,60 +4484,61 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB46_10: # %cond.store
; RV32ZVE32F-NEXT: .cfi_restore_state
-; RV32ZVE32F-NEXT: lw a1, 0(a0)
+; RV32ZVE32F-NEXT: lw s1, 0(a0)
; RV32ZVE32F-NEXT: lw a0, 4(a0)
-; RV32ZVE32F-NEXT: vmv.x.s s2, v8
-; RV32ZVE32F-NEXT: sw a1, 0(s2)
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s s2, v10
+; RV32ZVE32F-NEXT: sw s1, 0(s2)
; RV32ZVE32F-NEXT: sw a0, 4(s2)
-; RV32ZVE32F-NEXT: andi a0, t0, 2
+; RV32ZVE32F-NEXT: andi a0, a1, 2
; RV32ZVE32F-NEXT: beqz a0, .LBB46_2
; RV32ZVE32F-NEXT: .LBB46_11: # %cond.store1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw s0, 0(a0)
-; RV32ZVE32F-NEXT: sw s1, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 4
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t6, 0(a0)
+; RV32ZVE32F-NEXT: sw s0, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 4
; RV32ZVE32F-NEXT: beqz a0, .LBB46_3
; RV32ZVE32F-NEXT: .LBB46_12: # %cond.store3
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw t5, 0(a0)
-; RV32ZVE32F-NEXT: sw t6, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 8
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t4, 0(a0)
+; RV32ZVE32F-NEXT: sw t5, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 8
; RV32ZVE32F-NEXT: beqz a0, .LBB46_4
; RV32ZVE32F-NEXT: .LBB46_13: # %cond.store5
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw t3, 0(a0)
-; RV32ZVE32F-NEXT: sw t4, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 16
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t2, 0(a0)
+; RV32ZVE32F-NEXT: sw t3, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 16
; RV32ZVE32F-NEXT: beqz a0, .LBB46_5
; RV32ZVE32F-NEXT: .LBB46_14: # %cond.store7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw t1, 0(a0)
-; RV32ZVE32F-NEXT: sw t2, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 32
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t0, 0(a0)
+; RV32ZVE32F-NEXT: sw t1, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 32
; RV32ZVE32F-NEXT: beqz a0, .LBB46_6
; RV32ZVE32F-NEXT: .LBB46_15: # %cond.store9
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a6, 0(a0)
; RV32ZVE32F-NEXT: sw a7, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 64
+; RV32ZVE32F-NEXT: andi a0, a1, 64
; RV32ZVE32F-NEXT: beqz a0, .LBB46_7
; RV32ZVE32F-NEXT: .LBB46_16: # %cond.store11
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a4, 0(a0)
; RV32ZVE32F-NEXT: sw a5, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, -128
+; RV32ZVE32F-NEXT: andi a0, a1, -128
; RV32ZVE32F-NEXT: bnez a0, .LBB46_8
; RV32ZVE32F-NEXT: j .LBB46_9
;
@@ -4644,20 +4649,20 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32V-LABEL: mscatter_baseidx_zext_v8i16_v8i64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vzext.vf2 v14, v12
-; RV32V-NEXT: vsll.vi v12, v14, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulu.vx v14, v12, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32V-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_zext_v8i16_v8i64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64V-NEXT: vzext.vf2 v14, v12
-; RV64V-NEXT: vsll.vi v12, v14, 3
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64V-NEXT: vwmulu.vx v14, v12, a1
; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV64V-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV64V-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i64:
@@ -4677,47 +4682,47 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
; RV32ZVE32F-NEXT: lw a7, 44(a0)
; RV32ZVE32F-NEXT: lw a4, 48(a0)
; RV32ZVE32F-NEXT: lw a5, 52(a0)
-; RV32ZVE32F-NEXT: lw t3, 24(a0)
-; RV32ZVE32F-NEXT: lw t4, 28(a0)
-; RV32ZVE32F-NEXT: lw t1, 32(a0)
-; RV32ZVE32F-NEXT: lw t2, 36(a0)
-; RV32ZVE32F-NEXT: lw s0, 8(a0)
-; RV32ZVE32F-NEXT: lw s1, 12(a0)
-; RV32ZVE32F-NEXT: lw t5, 16(a0)
-; RV32ZVE32F-NEXT: lw t6, 20(a0)
+; RV32ZVE32F-NEXT: lw t2, 24(a0)
+; RV32ZVE32F-NEXT: lw t3, 28(a0)
+; RV32ZVE32F-NEXT: lw t0, 32(a0)
+; RV32ZVE32F-NEXT: lw t1, 36(a0)
+; RV32ZVE32F-NEXT: lw t6, 8(a0)
+; RV32ZVE32F-NEXT: lw s0, 12(a0)
+; RV32ZVE32F-NEXT: lw t4, 16(a0)
+; RV32ZVE32F-NEXT: lw t5, 20(a0)
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vzext.vf2 v10, v8
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.v.x v10, a1
+; RV32ZVE32F-NEXT: li s1, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT: vmv.x.s t0, v0
-; RV32ZVE32F-NEXT: andi s2, t0, 1
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v0
+; RV32ZVE32F-NEXT: andi s2, a1, 1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccu.vx v10, s1, v8
; RV32ZVE32F-NEXT: bnez s2, .LBB47_10
; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi a0, t0, 2
+; RV32ZVE32F-NEXT: andi a0, a1, 2
; RV32ZVE32F-NEXT: bnez a0, .LBB47_11
; RV32ZVE32F-NEXT: .LBB47_2: # %else2
-; RV32ZVE32F-NEXT: andi a0, t0, 4
+; RV32ZVE32F-NEXT: andi a0, a1, 4
; RV32ZVE32F-NEXT: bnez a0, .LBB47_12
; RV32ZVE32F-NEXT: .LBB47_3: # %else4
-; RV32ZVE32F-NEXT: andi a0, t0, 8
+; RV32ZVE32F-NEXT: andi a0, a1, 8
; RV32ZVE32F-NEXT: bnez a0, .LBB47_13
; RV32ZVE32F-NEXT: .LBB47_4: # %else6
-; RV32ZVE32F-NEXT: andi a0, t0, 16
+; RV32ZVE32F-NEXT: andi a0, a1, 16
; RV32ZVE32F-NEXT: bnez a0, .LBB47_14
; RV32ZVE32F-NEXT: .LBB47_5: # %else8
-; RV32ZVE32F-NEXT: andi a0, t0, 32
+; RV32ZVE32F-NEXT: andi a0, a1, 32
; RV32ZVE32F-NEXT: bnez a0, .LBB47_15
; RV32ZVE32F-NEXT: .LBB47_6: # %else10
-; RV32ZVE32F-NEXT: andi a0, t0, 64
+; RV32ZVE32F-NEXT: andi a0, a1, 64
; RV32ZVE32F-NEXT: bnez a0, .LBB47_16
; RV32ZVE32F-NEXT: .LBB47_7: # %else12
-; RV32ZVE32F-NEXT: andi a0, t0, -128
+; RV32ZVE32F-NEXT: andi a0, a1, -128
; RV32ZVE32F-NEXT: beqz a0, .LBB47_9
; RV32ZVE32F-NEXT: .LBB47_8: # %cond.store13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a2, 0(a0)
; RV32ZVE32F-NEXT: sw a3, 4(a0)
@@ -4733,60 +4738,61 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB47_10: # %cond.store
; RV32ZVE32F-NEXT: .cfi_restore_state
-; RV32ZVE32F-NEXT: lw a1, 0(a0)
+; RV32ZVE32F-NEXT: lw s1, 0(a0)
; RV32ZVE32F-NEXT: lw a0, 4(a0)
-; RV32ZVE32F-NEXT: vmv.x.s s2, v8
-; RV32ZVE32F-NEXT: sw a1, 0(s2)
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s s2, v10
+; RV32ZVE32F-NEXT: sw s1, 0(s2)
; RV32ZVE32F-NEXT: sw a0, 4(s2)
-; RV32ZVE32F-NEXT: andi a0, t0, 2
+; RV32ZVE32F-NEXT: andi a0, a1, 2
; RV32ZVE32F-NEXT: beqz a0, .LBB47_2
; RV32ZVE32F-NEXT: .LBB47_11: # %cond.store1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw s0, 0(a0)
-; RV32ZVE32F-NEXT: sw s1, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 4
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t6, 0(a0)
+; RV32ZVE32F-NEXT: sw s0, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 4
; RV32ZVE32F-NEXT: beqz a0, .LBB47_3
; RV32ZVE32F-NEXT: .LBB47_12: # %cond.store3
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw t5, 0(a0)
-; RV32ZVE32F-NEXT: sw t6, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 8
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t4, 0(a0)
+; RV32ZVE32F-NEXT: sw t5, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 8
; RV32ZVE32F-NEXT: beqz a0, .LBB47_4
; RV32ZVE32F-NEXT: .LBB47_13: # %cond.store5
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw t3, 0(a0)
-; RV32ZVE32F-NEXT: sw t4, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 16
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t2, 0(a0)
+; RV32ZVE32F-NEXT: sw t3, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 16
; RV32ZVE32F-NEXT: beqz a0, .LBB47_5
; RV32ZVE32F-NEXT: .LBB47_14: # %cond.store7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: sw t1, 0(a0)
-; RV32ZVE32F-NEXT: sw t2, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 32
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
+; RV32ZVE32F-NEXT: sw t0, 0(a0)
+; RV32ZVE32F-NEXT: sw t1, 4(a0)
+; RV32ZVE32F-NEXT: andi a0, a1, 32
; RV32ZVE32F-NEXT: beqz a0, .LBB47_6
; RV32ZVE32F-NEXT: .LBB47_15: # %cond.store9
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a6, 0(a0)
; RV32ZVE32F-NEXT: sw a7, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, 64
+; RV32ZVE32F-NEXT: andi a0, a1, 64
; RV32ZVE32F-NEXT: beqz a0, .LBB47_7
; RV32ZVE32F-NEXT: .LBB47_16: # %cond.store11
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: sw a4, 0(a0)
; RV32ZVE32F-NEXT: sw a5, 4(a0)
-; RV32ZVE32F-NEXT: andi a0, t0, -128
+; RV32ZVE32F-NEXT: andi a0, a1, -128
; RV32ZVE32F-NEXT: bnez a0, .LBB47_8
; RV32ZVE32F-NEXT: j .LBB47_9
;
@@ -4913,10 +4919,11 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> %
;
; RV64V-LABEL: mscatter_baseidx_v8i32_v8i64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64V-NEXT: vsext.vf2 v16, v12
-; RV64V-NEXT: vsll.vi v12, v16, 3
-; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulsu.vx v16, v12, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mscatter_baseidx_v8i32_v8i64:
@@ -5162,10 +5169,11 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
;
; RV64V-LABEL: mscatter_baseidx_sext_v8i32_v8i64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64V-NEXT: vsext.vf2 v16, v12
-; RV64V-NEXT: vsll.vi v12, v16, 3
-; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulsu.vx v16, v12, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8i64:
@@ -5412,10 +5420,11 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i
;
; RV64V-LABEL: mscatter_baseidx_zext_v8i32_v8i64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64V-NEXT: vzext.vf2 v16, v12
-; RV64V-NEXT: vsll.vi v12, v16, 3
-; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulu.vx v16, v12, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8i64:
@@ -9019,20 +9028,20 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v11, v10
-; RV32-NEXT: vsll.vi v10, v11, 2
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v11, v10, a1
; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v11, v0.t
; RV32-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8f32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64V-NEXT: vzext.vf2 v11, v10
-; RV64V-NEXT: vsll.vi v10, v11, 2
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64V-NEXT: vwmulu.vx v11, v10, a1
; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV64V-NEXT: vsoxei16.v v8, (a0), v10, v0.t
+; RV64V-NEXT: vsoxei16.v v8, (a0), v11, v0.t
; RV64V-NEXT: ret
;
; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8f32:
@@ -9166,10 +9175,11 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v10, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_v8i16_v8f32:
@@ -9304,10 +9314,11 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16>
define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v10, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_sext_v8i16_v8f32:
@@ -9443,18 +9454,20 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v10, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_zext_v8i16_v8f32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64V-NEXT: vzext.vf2 v12, v10
-; RV64V-NEXT: vsll.vi v10, v12, 2
-; RV64V-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64V-NEXT: vwmulu.vx v12, v10, a1
+; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64V-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV64V-NEXT: ret
;
; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8f32:
@@ -9596,10 +9609,9 @@ define void @mscatter_baseidx_v8f32(<8 x float> %val, ptr %base, <8 x i32> %idxs
;
; RV64V-LABEL: mscatter_baseidx_v8f32:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64V-NEXT: vsext.vf2 v12, v10
-; RV64V-NEXT: vsll.vi v12, v12, 2
-; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64V-NEXT: li a1, 4
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulsu.vx v12, v10, a1
; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t
; RV64V-NEXT: ret
;
@@ -10545,20 +10557,20 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) {
; RV32V-LABEL: mscatter_baseidx_zext_v8i8_v8f64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32V-NEXT: vzext.vf2 v13, v12
-; RV32V-NEXT: vsll.vi v12, v13, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32V-NEXT: vwmulu.vx v13, v12, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV32V-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV32V-NEXT: vsoxei16.v v8, (a0), v13, v0.t
; RV32V-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8f64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64V-NEXT: vzext.vf2 v13, v12
-; RV64V-NEXT: vsll.vi v12, v13, 3
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64V-NEXT: vwmulu.vx v13, v12, a1
; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV64V-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV64V-NEXT: vsoxei16.v v8, (a0), v13, v0.t
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8f64:
@@ -10755,11 +10767,11 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32V-LABEL: mscatter_baseidx_v8i16_v8f64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vsext.vf2 v14, v12
-; RV32V-NEXT: vsll.vi v12, v14, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulsu.vx v14, v12, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32V-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_v8i16_v8f64:
@@ -10773,87 +10785,88 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16
; RV32ZVE32F-LABEL: mscatter_baseidx_v8i16_v8f64:
; RV32ZVE32F: # %bb.0:
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsext.vf2 v10, v8
+; RV32ZVE32F-NEXT: vmv.v.x v10, a0
+; RV32ZVE32F-NEXT: li a1, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT: vmv.x.s a1, v0
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT: andi a2, a1, 1
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0
+; RV32ZVE32F-NEXT: vmv.x.s a0, v0
+; RV32ZVE32F-NEXT: andi a2, a0, 1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8
; RV32ZVE32F-NEXT: bnez a2, .LBB94_9
; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi a0, a1, 2
-; RV32ZVE32F-NEXT: bnez a0, .LBB94_10
+; RV32ZVE32F-NEXT: andi a1, a0, 2
+; RV32ZVE32F-NEXT: bnez a1, .LBB94_10
; RV32ZVE32F-NEXT: .LBB94_2: # %else2
-; RV32ZVE32F-NEXT: andi a0, a1, 4
-; RV32ZVE32F-NEXT: bnez a0, .LBB94_11
+; RV32ZVE32F-NEXT: andi a1, a0, 4
+; RV32ZVE32F-NEXT: bnez a1, .LBB94_11
; RV32ZVE32F-NEXT: .LBB94_3: # %else4
-; RV32ZVE32F-NEXT: andi a0, a1, 8
-; RV32ZVE32F-NEXT: bnez a0, .LBB94_12
+; RV32ZVE32F-NEXT: andi a1, a0, 8
+; RV32ZVE32F-NEXT: bnez a1, .LBB94_12
; RV32ZVE32F-NEXT: .LBB94_4: # %else6
-; RV32ZVE32F-NEXT: andi a0, a1, 16
-; RV32ZVE32F-NEXT: bnez a0, .LBB94_13
+; RV32ZVE32F-NEXT: andi a1, a0, 16
+; RV32ZVE32F-NEXT: bnez a1, .LBB94_13
; RV32ZVE32F-NEXT: .LBB94_5: # %else8
-; RV32ZVE32F-NEXT: andi a0, a1, 32
-; RV32ZVE32F-NEXT: bnez a0, .LBB94_14
+; RV32ZVE32F-NEXT: andi a1, a0, 32
+; RV32ZVE32F-NEXT: bnez a1, .LBB94_14
; RV32ZVE32F-NEXT: .LBB94_6: # %else10
-; RV32ZVE32F-NEXT: andi a0, a1, 64
-; RV32ZVE32F-NEXT: bnez a0, .LBB94_15
+; RV32ZVE32F-NEXT: andi a1, a0, 64
+; RV32ZVE32F-NEXT: bnez a1, .LBB94_15
; RV32ZVE32F-NEXT: .LBB94_7: # %else12
-; RV32ZVE32F-NEXT: andi a0, a1, -128
+; RV32ZVE32F-NEXT: andi a0, a0, -128
; RV32ZVE32F-NEXT: bnez a0, .LBB94_16
; RV32ZVE32F-NEXT: .LBB94_8: # %else14
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB94_9: # %cond.store
-; RV32ZVE32F-NEXT: vmv.x.s a0, v8
-; RV32ZVE32F-NEXT: fsd fa0, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 2
-; RV32ZVE32F-NEXT: beqz a0, .LBB94_2
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s a1, v10
+; RV32ZVE32F-NEXT: fsd fa0, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 2
+; RV32ZVE32F-NEXT: beqz a1, .LBB94_2
; RV32ZVE32F-NEXT: .LBB94_10: # %cond.store1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa1, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 4
-; RV32ZVE32F-NEXT: beqz a0, .LBB94_3
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa1, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 4
+; RV32ZVE32F-NEXT: beqz a1, .LBB94_3
; RV32ZVE32F-NEXT: .LBB94_11: # %cond.store3
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa2, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 8
-; RV32ZVE32F-NEXT: beqz a0, .LBB94_4
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa2, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 8
+; RV32ZVE32F-NEXT: beqz a1, .LBB94_4
; RV32ZVE32F-NEXT: .LBB94_12: # %cond.store5
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa3, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 16
-; RV32ZVE32F-NEXT: beqz a0, .LBB94_5
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa3, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 16
+; RV32ZVE32F-NEXT: beqz a1, .LBB94_5
; RV32ZVE32F-NEXT: .LBB94_13: # %cond.store7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa4, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 32
-; RV32ZVE32F-NEXT: beqz a0, .LBB94_6
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa4, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 32
+; RV32ZVE32F-NEXT: beqz a1, .LBB94_6
; RV32ZVE32F-NEXT: .LBB94_14: # %cond.store9
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa5, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 64
-; RV32ZVE32F-NEXT: beqz a0, .LBB94_7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa5, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 64
+; RV32ZVE32F-NEXT: beqz a1, .LBB94_7
; RV32ZVE32F-NEXT: .LBB94_15: # %cond.store11
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa6, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, -128
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa6, 0(a1)
+; RV32ZVE32F-NEXT: andi a0, a0, -128
; RV32ZVE32F-NEXT: beqz a0, .LBB94_8
; RV32ZVE32F-NEXT: .LBB94_16: # %cond.store13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: fsd fa7, 0(a0)
; RV32ZVE32F-NEXT: ret
@@ -10956,11 +10969,11 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16
define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32V-LABEL: mscatter_baseidx_sext_v8i16_v8f64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vsext.vf2 v14, v12
-; RV32V-NEXT: vsll.vi v12, v14, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulsu.vx v14, v12, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32V-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_sext_v8i16_v8f64:
@@ -10974,87 +10987,88 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8f64:
; RV32ZVE32F: # %bb.0:
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsext.vf2 v10, v8
+; RV32ZVE32F-NEXT: vmv.v.x v10, a0
+; RV32ZVE32F-NEXT: li a1, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT: vmv.x.s a1, v0
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT: andi a2, a1, 1
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0
+; RV32ZVE32F-NEXT: vmv.x.s a0, v0
+; RV32ZVE32F-NEXT: andi a2, a0, 1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8
; RV32ZVE32F-NEXT: bnez a2, .LBB95_9
; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi a0, a1, 2
-; RV32ZVE32F-NEXT: bnez a0, .LBB95_10
+; RV32ZVE32F-NEXT: andi a1, a0, 2
+; RV32ZVE32F-NEXT: bnez a1, .LBB95_10
; RV32ZVE32F-NEXT: .LBB95_2: # %else2
-; RV32ZVE32F-NEXT: andi a0, a1, 4
-; RV32ZVE32F-NEXT: bnez a0, .LBB95_11
+; RV32ZVE32F-NEXT: andi a1, a0, 4
+; RV32ZVE32F-NEXT: bnez a1, .LBB95_11
; RV32ZVE32F-NEXT: .LBB95_3: # %else4
-; RV32ZVE32F-NEXT: andi a0, a1, 8
-; RV32ZVE32F-NEXT: bnez a0, .LBB95_12
+; RV32ZVE32F-NEXT: andi a1, a0, 8
+; RV32ZVE32F-NEXT: bnez a1, .LBB95_12
; RV32ZVE32F-NEXT: .LBB95_4: # %else6
-; RV32ZVE32F-NEXT: andi a0, a1, 16
-; RV32ZVE32F-NEXT: bnez a0, .LBB95_13
+; RV32ZVE32F-NEXT: andi a1, a0, 16
+; RV32ZVE32F-NEXT: bnez a1, .LBB95_13
; RV32ZVE32F-NEXT: .LBB95_5: # %else8
-; RV32ZVE32F-NEXT: andi a0, a1, 32
-; RV32ZVE32F-NEXT: bnez a0, .LBB95_14
+; RV32ZVE32F-NEXT: andi a1, a0, 32
+; RV32ZVE32F-NEXT: bnez a1, .LBB95_14
; RV32ZVE32F-NEXT: .LBB95_6: # %else10
-; RV32ZVE32F-NEXT: andi a0, a1, 64
-; RV32ZVE32F-NEXT: bnez a0, .LBB95_15
+; RV32ZVE32F-NEXT: andi a1, a0, 64
+; RV32ZVE32F-NEXT: bnez a1, .LBB95_15
; RV32ZVE32F-NEXT: .LBB95_7: # %else12
-; RV32ZVE32F-NEXT: andi a0, a1, -128
+; RV32ZVE32F-NEXT: andi a0, a0, -128
; RV32ZVE32F-NEXT: bnez a0, .LBB95_16
; RV32ZVE32F-NEXT: .LBB95_8: # %else14
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB95_9: # %cond.store
-; RV32ZVE32F-NEXT: vmv.x.s a0, v8
-; RV32ZVE32F-NEXT: fsd fa0, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 2
-; RV32ZVE32F-NEXT: beqz a0, .LBB95_2
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s a1, v10
+; RV32ZVE32F-NEXT: fsd fa0, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 2
+; RV32ZVE32F-NEXT: beqz a1, .LBB95_2
; RV32ZVE32F-NEXT: .LBB95_10: # %cond.store1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa1, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 4
-; RV32ZVE32F-NEXT: beqz a0, .LBB95_3
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa1, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 4
+; RV32ZVE32F-NEXT: beqz a1, .LBB95_3
; RV32ZVE32F-NEXT: .LBB95_11: # %cond.store3
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa2, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 8
-; RV32ZVE32F-NEXT: beqz a0, .LBB95_4
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa2, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 8
+; RV32ZVE32F-NEXT: beqz a1, .LBB95_4
; RV32ZVE32F-NEXT: .LBB95_12: # %cond.store5
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa3, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 16
-; RV32ZVE32F-NEXT: beqz a0, .LBB95_5
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa3, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 16
+; RV32ZVE32F-NEXT: beqz a1, .LBB95_5
; RV32ZVE32F-NEXT: .LBB95_13: # %cond.store7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa4, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 32
-; RV32ZVE32F-NEXT: beqz a0, .LBB95_6
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa4, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 32
+; RV32ZVE32F-NEXT: beqz a1, .LBB95_6
; RV32ZVE32F-NEXT: .LBB95_14: # %cond.store9
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa5, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 64
-; RV32ZVE32F-NEXT: beqz a0, .LBB95_7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa5, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 64
+; RV32ZVE32F-NEXT: beqz a1, .LBB95_7
; RV32ZVE32F-NEXT: .LBB95_15: # %cond.store11
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa6, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, -128
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa6, 0(a1)
+; RV32ZVE32F-NEXT: andi a0, a0, -128
; RV32ZVE32F-NEXT: beqz a0, .LBB95_8
; RV32ZVE32F-NEXT: .LBB95_16: # %cond.store13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: fsd fa7, 0(a0)
; RV32ZVE32F-NEXT: ret
@@ -11158,106 +11172,107 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) {
; RV32V-LABEL: mscatter_baseidx_zext_v8i16_v8f64:
; RV32V: # %bb.0:
-; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32V-NEXT: vzext.vf2 v14, v12
-; RV32V-NEXT: vsll.vi v12, v14, 3
+; RV32V-NEXT: li a1, 8
+; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32V-NEXT: vwmulu.vx v14, v12, a1
; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32V-NEXT: ret
;
; RV64V-LABEL: mscatter_baseidx_zext_v8i16_v8f64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64V-NEXT: vzext.vf2 v14, v12
-; RV64V-NEXT: vsll.vi v12, v14, 3
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64V-NEXT: vwmulu.vx v14, v12, a1
; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV64V-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV64V-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8f64:
; RV32ZVE32F: # %bb.0:
; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vzext.vf2 v10, v8
+; RV32ZVE32F-NEXT: vmv.v.x v10, a0
+; RV32ZVE32F-NEXT: li a1, 8
; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT: vmv.x.s a1, v0
-; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3
-; RV32ZVE32F-NEXT: andi a2, a1, 1
-; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0
+; RV32ZVE32F-NEXT: vmv.x.s a0, v0
+; RV32ZVE32F-NEXT: andi a2, a0, 1
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; RV32ZVE32F-NEXT: vwmaccu.vx v10, a1, v8
; RV32ZVE32F-NEXT: bnez a2, .LBB96_9
; RV32ZVE32F-NEXT: # %bb.1: # %else
-; RV32ZVE32F-NEXT: andi a0, a1, 2
-; RV32ZVE32F-NEXT: bnez a0, .LBB96_10
+; RV32ZVE32F-NEXT: andi a1, a0, 2
+; RV32ZVE32F-NEXT: bnez a1, .LBB96_10
; RV32ZVE32F-NEXT: .LBB96_2: # %else2
-; RV32ZVE32F-NEXT: andi a0, a1, 4
-; RV32ZVE32F-NEXT: bnez a0, .LBB96_11
+; RV32ZVE32F-NEXT: andi a1, a0, 4
+; RV32ZVE32F-NEXT: bnez a1, .LBB96_11
; RV32ZVE32F-NEXT: .LBB96_3: # %else4
-; RV32ZVE32F-NEXT: andi a0, a1, 8
-; RV32ZVE32F-NEXT: bnez a0, .LBB96_12
+; RV32ZVE32F-NEXT: andi a1, a0, 8
+; RV32ZVE32F-NEXT: bnez a1, .LBB96_12
; RV32ZVE32F-NEXT: .LBB96_4: # %else6
-; RV32ZVE32F-NEXT: andi a0, a1, 16
-; RV32ZVE32F-NEXT: bnez a0, .LBB96_13
+; RV32ZVE32F-NEXT: andi a1, a0, 16
+; RV32ZVE32F-NEXT: bnez a1, .LBB96_13
; RV32ZVE32F-NEXT: .LBB96_5: # %else8
-; RV32ZVE32F-NEXT: andi a0, a1, 32
-; RV32ZVE32F-NEXT: bnez a0, .LBB96_14
+; RV32ZVE32F-NEXT: andi a1, a0, 32
+; RV32ZVE32F-NEXT: bnez a1, .LBB96_14
; RV32ZVE32F-NEXT: .LBB96_6: # %else10
-; RV32ZVE32F-NEXT: andi a0, a1, 64
-; RV32ZVE32F-NEXT: bnez a0, .LBB96_15
+; RV32ZVE32F-NEXT: andi a1, a0, 64
+; RV32ZVE32F-NEXT: bnez a1, .LBB96_15
; RV32ZVE32F-NEXT: .LBB96_7: # %else12
-; RV32ZVE32F-NEXT: andi a0, a1, -128
+; RV32ZVE32F-NEXT: andi a0, a0, -128
; RV32ZVE32F-NEXT: bnez a0, .LBB96_16
; RV32ZVE32F-NEXT: .LBB96_8: # %else14
; RV32ZVE32F-NEXT: ret
; RV32ZVE32F-NEXT: .LBB96_9: # %cond.store
-; RV32ZVE32F-NEXT: vmv.x.s a0, v8
-; RV32ZVE32F-NEXT: fsd fa0, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 2
-; RV32ZVE32F-NEXT: beqz a0, .LBB96_2
+; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT: vmv.x.s a1, v10
+; RV32ZVE32F-NEXT: fsd fa0, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 2
+; RV32ZVE32F-NEXT: beqz a1, .LBB96_2
; RV32ZVE32F-NEXT: .LBB96_10: # %cond.store1
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa1, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 4
-; RV32ZVE32F-NEXT: beqz a0, .LBB96_3
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa1, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 4
+; RV32ZVE32F-NEXT: beqz a1, .LBB96_3
; RV32ZVE32F-NEXT: .LBB96_11: # %cond.store3
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa2, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 8
-; RV32ZVE32F-NEXT: beqz a0, .LBB96_4
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa2, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 8
+; RV32ZVE32F-NEXT: beqz a1, .LBB96_4
; RV32ZVE32F-NEXT: .LBB96_12: # %cond.store5
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa3, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 16
-; RV32ZVE32F-NEXT: beqz a0, .LBB96_5
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa3, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 16
+; RV32ZVE32F-NEXT: beqz a1, .LBB96_5
; RV32ZVE32F-NEXT: .LBB96_13: # %cond.store7
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa4, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 32
-; RV32ZVE32F-NEXT: beqz a0, .LBB96_6
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa4, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 32
+; RV32ZVE32F-NEXT: beqz a1, .LBB96_6
; RV32ZVE32F-NEXT: .LBB96_14: # %cond.store9
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa5, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, 64
-; RV32ZVE32F-NEXT: beqz a0, .LBB96_7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa5, 0(a1)
+; RV32ZVE32F-NEXT: andi a1, a0, 64
+; RV32ZVE32F-NEXT: beqz a1, .LBB96_7
; RV32ZVE32F-NEXT: .LBB96_15: # %cond.store11
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6
-; RV32ZVE32F-NEXT: vmv.x.s a0, v10
-; RV32ZVE32F-NEXT: fsd fa6, 0(a0)
-; RV32ZVE32F-NEXT: andi a0, a1, -128
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6
+; RV32ZVE32F-NEXT: vmv.x.s a1, v8
+; RV32ZVE32F-NEXT: fsd fa6, 0(a1)
+; RV32ZVE32F-NEXT: andi a0, a0, -128
; RV32ZVE32F-NEXT: beqz a0, .LBB96_8
; RV32ZVE32F-NEXT: .LBB96_16: # %cond.store13
; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7
+; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7
; RV32ZVE32F-NEXT: vmv.x.s a0, v8
; RV32ZVE32F-NEXT: fsd fa7, 0(a0)
; RV32ZVE32F-NEXT: ret
@@ -11377,10 +11392,11 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr %base, <8 x i32
;
; RV64V-LABEL: mscatter_baseidx_v8i32_v8f64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64V-NEXT: vsext.vf2 v16, v12
-; RV64V-NEXT: vsll.vi v12, v16, 3
-; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulsu.vx v16, v12, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mscatter_baseidx_v8i32_v8f64:
@@ -11576,10 +11592,11 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8
;
; RV64V-LABEL: mscatter_baseidx_sext_v8i32_v8f64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64V-NEXT: vsext.vf2 v16, v12
-; RV64V-NEXT: vsll.vi v12, v16, 3
-; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulsu.vx v16, v12, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8f64:
@@ -11776,10 +11793,11 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8
;
; RV64V-LABEL: mscatter_baseidx_zext_v8i32_v8f64:
; RV64V: # %bb.0:
-; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64V-NEXT: vzext.vf2 v16, v12
-; RV64V-NEXT: vsll.vi v12, v16, 3
-; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64V-NEXT: li a1, 8
+; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64V-NEXT: vwmulu.vx v16, v12, a1
+; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8f64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index df9ff0f..1f6513a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -736,18 +736,18 @@ define <8 x i32> @vpgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
define <8 x i32> @vpgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v9, v8
-; RV32-NEXT: vsll.vi v10, v9, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v10, v8, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; RV32-NEXT: vluxei16.v v8, (a0), v10, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64-NEXT: vzext.vf2 v9, v8
-; RV64-NEXT: vsll.vi v10, v9, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vwmulu.vx v10, v8, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; RV64-NEXT: vluxei16.v v8, (a0), v10, v0.t
; RV64-NEXT: ret
@@ -760,11 +760,11 @@ define <8 x i32> @vpgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
define <8 x i32> @vpgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v8, v10, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v10, v8, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_v8i16_v8i32:
@@ -783,11 +783,11 @@ define <8 x i32> @vpgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x
define <8 x i32> @vpgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v8, v10, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v10, v8, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_sext_v8i16_v8i32:
@@ -807,20 +807,20 @@ define <8 x i32> @vpgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs,
define <8 x i32> @vpgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v8, v10, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v10, v8, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_v8i16_v8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v10, v8
-; RV64-NEXT: vsll.vi v8, v10, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v10, v8, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV64-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei32.v v8, (a0), v10, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i16> %idxs to <8 x i32>
%ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs
@@ -839,9 +839,9 @@ define <8 x i32> @vpgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m
;
; RV64-LABEL: vpgather_baseidx_v8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v12, v12, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
@@ -974,18 +974,18 @@ define <8 x i64> @vpgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
define <8 x i64> @vpgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v9, v8
-; RV32-NEXT: vsll.vi v12, v9, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v8, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV32-NEXT: vluxei16.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64-NEXT: vzext.vf2 v9, v8
-; RV64-NEXT: vsll.vi v12, v9, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV64-NEXT: vluxei16.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
@@ -998,9 +998,9 @@ define <8 x i64> @vpgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
define <8 x i64> @vpgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_v8i16_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v12, v10, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
@@ -1021,9 +1021,9 @@ define <8 x i64> @vpgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x
define <8 x i64> @vpgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_v8i16_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v12, v10, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
@@ -1045,18 +1045,18 @@ define <8 x i64> @vpgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs,
define <8 x i64> @vpgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_v8i16_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v12, v10, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v8, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_v8i16_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v10, v8
-; RV64-NEXT: vsll.vi v12, v10, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV64-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
@@ -1077,11 +1077,11 @@ define <8 x i64> @vpgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x
;
; RV64-LABEL: vpgather_baseidx_v8i32_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v8, v12, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i64, ptr %base, <8 x i32> %idxs
%v = call <8 x i64> @llvm.vp.gather.v8i64.v8p0(<8 x ptr> %ptrs, <8 x i1> %m, i32 %evl)
@@ -1099,11 +1099,11 @@ define <8 x i64> @vpgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs,
;
; RV64-LABEL: vpgather_baseidx_sext_v8i32_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v8, v12, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
%eidxs = sext <8 x i32> %idxs to <8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs
@@ -1122,11 +1122,11 @@ define <8 x i64> @vpgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs,
;
; RV64-LABEL: vpgather_baseidx_zext_v8i32_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v8, v12, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i32> %idxs to <8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs
@@ -1618,18 +1618,18 @@ define <8 x float> @vpgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs,
define <8 x float> @vpgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v9, v8
-; RV32-NEXT: vsll.vi v10, v9, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v10, v8, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; RV32-NEXT: vluxei16.v v8, (a0), v10, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64-NEXT: vzext.vf2 v9, v8
-; RV64-NEXT: vsll.vi v10, v9, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vwmulu.vx v10, v8, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; RV64-NEXT: vluxei16.v v8, (a0), v10, v0.t
; RV64-NEXT: ret
@@ -1642,11 +1642,11 @@ define <8 x float> @vpgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs,
define <8 x float> @vpgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v8, v10, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v10, v8, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_v8i16_v8f32:
@@ -1665,11 +1665,11 @@ define <8 x float> @vpgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8
define <8 x float> @vpgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v8, v10, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v10, v8, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_sext_v8i16_v8f32:
@@ -1689,20 +1689,20 @@ define <8 x float> @vpgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs
define <8 x float> @vpgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v8, v10, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v10, v8, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_v8i16_v8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v10, v8
-; RV64-NEXT: vsll.vi v8, v10, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v10, v8, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV64-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei32.v v8, (a0), v10, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i16> %idxs to <8 x i32>
%ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs
@@ -1721,9 +1721,9 @@ define <8 x float> @vpgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1>
;
; RV64-LABEL: vpgather_baseidx_v8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v12, v12, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
@@ -1856,18 +1856,18 @@ define <8 x double> @vpgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
define <8 x double> @vpgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v9, v8
-; RV32-NEXT: vsll.vi v12, v9, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v8, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV32-NEXT: vluxei16.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64-NEXT: vzext.vf2 v9, v8
-; RV64-NEXT: vsll.vi v12, v9, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV64-NEXT: vluxei16.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
@@ -1880,9 +1880,9 @@ define <8 x double> @vpgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
define <8 x double> @vpgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_v8i16_v8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v12, v10, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
@@ -1903,9 +1903,9 @@ define <8 x double> @vpgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
define <8 x double> @vpgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_v8i16_v8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v12, v10, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
@@ -1927,18 +1927,18 @@ define <8 x double> @vpgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idx
define <8 x double> @vpgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_v8i16_v8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v12, v10, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v8, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_v8i16_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v10, v8
-; RV64-NEXT: vsll.vi v12, v10, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; RV64-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
@@ -1959,11 +1959,11 @@ define <8 x double> @vpgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8
;
; RV64-LABEL: vpgather_baseidx_v8i32_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v8, v12, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <8 x i32> %idxs
%v = call <8 x double> @llvm.vp.gather.v8f64.v8p0(<8 x ptr> %ptrs, <8 x i1> %m, i32 %evl)
@@ -1981,11 +1981,11 @@ define <8 x double> @vpgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idx
;
; RV64-LABEL: vpgather_baseidx_sext_v8i32_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v8, v12, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
%eidxs = sext <8 x i32> %idxs to <8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs
@@ -2004,11 +2004,11 @@ define <8 x double> @vpgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idx
;
; RV64-LABEL: vpgather_baseidx_zext_v8i32_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v8, v12, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i32> %idxs to <8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs
@@ -2209,25 +2209,25 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
; RV32-LABEL: vpgather_baseidx_zext_v32i8_v32f64:
; RV32: # %bb.0:
; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v12, v8
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: vsll.vi v16, v12, 3
+; RV32-NEXT: li a3, 8
+; RV32-NEXT: li a4, 16
+; RV32-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v16, v8, a3
; RV32-NEXT: mv a2, a1
-; RV32-NEXT: bltu a1, a3, .LBB97_2
+; RV32-NEXT: bltu a1, a4, .LBB97_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB97_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei16.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v0, 2
; RV32-NEXT: sltu a1, a1, a2
; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v0, 2
; RV32-NEXT: and a1, a1, a2
-; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei16.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2235,25 +2235,25 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
; RV64-LABEL: vpgather_baseidx_zext_v32i8_v32f64:
; RV64: # %bb.0:
; RV64-NEXT: li a2, 32
-; RV64-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v12, v8
-; RV64-NEXT: li a3, 16
-; RV64-NEXT: vsll.vi v16, v12, 3
+; RV64-NEXT: li a3, 8
+; RV64-NEXT: li a4, 16
+; RV64-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a3
; RV64-NEXT: mv a2, a1
-; RV64-NEXT: bltu a1, a3, .LBB97_2
+; RV64-NEXT: bltu a1, a4, .LBB97_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB97_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei16.v v8, (a0), v16, v0.t
+; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v16, 16
; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v0, 2
; RV64-NEXT: sltu a1, a1, a2
; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v0, 2
; RV64-NEXT: and a1, a1, a2
-; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma
-; RV64-NEXT: vslidedown.vi v24, v16, 16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei16.v v16, (a0), v24, v0.t
; RV64-NEXT: ret
@@ -2267,25 +2267,25 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
; RV32-LABEL: vpgather_baseidx_v32i16_v32f64:
; RV32: # %bb.0:
; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: li a3, 8
+; RV32-NEXT: li a4, 16
+; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a3
; RV32-NEXT: mv a2, a1
-; RV32-NEXT: bltu a1, a3, .LBB98_2
+; RV32-NEXT: bltu a1, a4, .LBB98_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB98_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v0, 2
; RV32-NEXT: sltu a1, a1, a2
; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v0, 2
; RV32-NEXT: and a1, a1, a2
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2325,25 +2325,25 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
; RV32-LABEL: vpgather_baseidx_sext_v32i16_v32f64:
; RV32: # %bb.0:
; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: li a3, 8
+; RV32-NEXT: li a4, 16
+; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a3
; RV32-NEXT: mv a2, a1
-; RV32-NEXT: bltu a1, a3, .LBB99_2
+; RV32-NEXT: bltu a1, a4, .LBB99_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB99_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v0, 2
; RV32-NEXT: sltu a1, a1, a2
; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v0, 2
; RV32-NEXT: and a1, a1, a2
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2385,25 +2385,25 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
; RV32-LABEL: vpgather_baseidx_zext_v32i16_v32f64:
; RV32: # %bb.0:
; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vzext.vf2 v16, v8
-; RV32-NEXT: li a3, 16
-; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: li a3, 8
+; RV32-NEXT: li a4, 16
+; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; RV32-NEXT: vwmulu.vx v16, v8, a3
; RV32-NEXT: mv a2, a1
-; RV32-NEXT: bltu a1, a3, .LBB100_2
+; RV32-NEXT: bltu a1, a4, .LBB100_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a2, 16
; RV32-NEXT: .LBB100_2:
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v0, v0, 2
; RV32-NEXT: sltu a1, a1, a2
; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v0, v0, 2
; RV32-NEXT: and a1, a1, a2
-; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
@@ -2411,25 +2411,25 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
; RV64-LABEL: vpgather_baseidx_zext_v32i16_v32f64:
; RV64: # %bb.0:
; RV64-NEXT: li a2, 32
-; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v16, v8
-; RV64-NEXT: li a3, 16
-; RV64-NEXT: vsll.vi v16, v16, 3
+; RV64-NEXT: li a3, 8
+; RV64-NEXT: li a4, 16
+; RV64-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a3
; RV64-NEXT: mv a2, a1
-; RV64-NEXT: bltu a1, a3, .LBB100_2
+; RV64-NEXT: bltu a1, a4, .LBB100_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB100_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vluxei32.v v8, (a0), v16, v0.t
+; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v16, 16
; RV64-NEXT: addi a2, a1, -16
-; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vi v0, v0, 2
; RV64-NEXT: sltu a1, a1, a2
; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v0, v0, 2
; RV64-NEXT: and a1, a1, a2
-; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v24, v16, 16
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV64-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV64-NEXT: ret
@@ -2468,20 +2468,19 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
; RV64-LABEL: vpgather_baseidx_v32i32_v32f64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v16, v8, 16
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v8
+; RV64-NEXT: vslidedown.vi v24, v8, 16
+; RV64-NEXT: li a2, 8
; RV64-NEXT: li a3, 16
-; RV64-NEXT: vsext.vf2 v8, v16
-; RV64-NEXT: vsll.vi v16, v8, 3
-; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v24, a2
+; RV64-NEXT: vwmulsu.vx v24, v8, a2
; RV64-NEXT: mv a2, a1
; RV64-NEXT: bltu a1, a3, .LBB101_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB101_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a1, a1, a2
; RV64-NEXT: addi a1, a1, -1
@@ -2524,22 +2523,20 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
;
; RV64-LABEL: vpgather_baseidx_sext_v32i32_v32f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v8
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 16
+; RV64-NEXT: vslidedown.vi v24, v8, 16
+; RV64-NEXT: li a2, 8
; RV64-NEXT: li a3, 16
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v16, v16, 3
-; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v24, a2
+; RV64-NEXT: vwmulsu.vx v24, v8, a2
; RV64-NEXT: mv a2, a1
; RV64-NEXT: bltu a1, a3, .LBB102_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB102_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a1, a1, a2
; RV64-NEXT: addi a1, a1, -1
@@ -2583,22 +2580,20 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
;
; RV64-LABEL: vpgather_baseidx_zext_v32i32_v32f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v24, v8
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 16
+; RV64-NEXT: vslidedown.vi v24, v8, 16
+; RV64-NEXT: li a2, 8
; RV64-NEXT: li a3, 16
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v16, v16, 3
-; RV64-NEXT: vsll.vi v8, v24, 3
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v24, a2
+; RV64-NEXT: vwmulu.vx v24, v8, a2
; RV64-NEXT: mv a2, a1
; RV64-NEXT: bltu a1, a3, .LBB103_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a2, 16
; RV64-NEXT: .LBB103_2:
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: addi a2, a1, -16
; RV64-NEXT: sltu a1, a1, a2
; RV64-NEXT: addi a1, a1, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
index d691dcd..f7e4716 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
@@ -556,20 +556,20 @@ define void @vpscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i
define void @vpscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_v8i8_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v11, v10
-; RV32-NEXT: vsll.vi v10, v11, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v11, v10, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v11, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_v8i8_v8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64-NEXT: vzext.vf2 v11, v10
-; RV64-NEXT: vsll.vi v10, v11, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vwmulu.vx v11, v10, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV64-NEXT: vsoxei16.v v8, (a0), v10, v0.t
+; RV64-NEXT: vsoxei16.v v8, (a0), v11, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i8> %idxs to <8 x i32>
%ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs
@@ -580,11 +580,11 @@ define void @vpscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i
define void @vpscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v10, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_v8i16_v8i32:
@@ -603,11 +603,11 @@ define void @vpscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16>
define void @vpscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_sext_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v10, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_sext_v8i16_v8i32:
@@ -627,20 +627,20 @@ define void @vpscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x
define void @vpscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_v8i16_v8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v10, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_v8i16_v8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v12, v10
-; RV64-NEXT: vsll.vi v10, v12, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v10, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV64-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV64-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i16> %idxs to <8 x i32>
%ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs
@@ -659,9 +659,9 @@ define void @vpscatter_baseidx_v8i32(<8 x i32> %val, ptr %base, <8 x i32> %idxs,
;
; RV64-LABEL: vpscatter_baseidx_v8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v12, v10
-; RV64-NEXT: vsll.vi v12, v12, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v12, v10, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
@@ -790,20 +790,20 @@ define void @vpscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i
define void @vpscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_v8i8_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v13, v12
-; RV32-NEXT: vsll.vi v12, v13, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v13, v12, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v13, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_v8i8_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64-NEXT: vzext.vf2 v13, v12
-; RV64-NEXT: vsll.vi v12, v13, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vwmulu.vx v13, v12, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei16.v v8, (a0), v13, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i8> %idxs to <8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs
@@ -814,11 +814,11 @@ define void @vpscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i
define void @vpscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_v8i16_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v14, v12
-; RV32-NEXT: vsll.vi v12, v14, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v14, v12, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_v8i16_v8i64:
@@ -837,11 +837,11 @@ define void @vpscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16>
define void @vpscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_sext_v8i16_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v14, v12
-; RV32-NEXT: vsll.vi v12, v14, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v14, v12, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_sext_v8i16_v8i64:
@@ -861,20 +861,20 @@ define void @vpscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x
define void @vpscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_v8i16_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v14, v12
-; RV32-NEXT: vsll.vi v12, v14, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v14, v12, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_v8i16_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v14, v12
-; RV64-NEXT: vsll.vi v12, v14, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v14, v12, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i16> %idxs to <8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs
@@ -893,11 +893,11 @@ define void @vpscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32>
;
; RV64-LABEL: vpscatter_baseidx_v8i32_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v12, v16, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v12, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i64, ptr %base, <8 x i32> %idxs
call void @llvm.vp.scatter.v8i64.v8p0(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m, i32 %evl)
@@ -915,11 +915,11 @@ define void @vpscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x
;
; RV64-LABEL: vpscatter_baseidx_sext_v8i32_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v12, v16, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v12, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = sext <8 x i32> %idxs to <8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs
@@ -938,11 +938,11 @@ define void @vpscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x
;
; RV64-LABEL: vpscatter_baseidx_zext_v8i32_v8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v12, v16, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v12, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i32> %idxs to <8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs
@@ -1323,20 +1323,20 @@ define void @vpscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
define void @vpscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_v8i8_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v11, v10
-; RV32-NEXT: vsll.vi v10, v11, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v11, v10, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v11, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_v8i8_v8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64-NEXT: vzext.vf2 v11, v10
-; RV64-NEXT: vsll.vi v10, v11, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vwmulu.vx v11, v10, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV64-NEXT: vsoxei16.v v8, (a0), v10, v0.t
+; RV64-NEXT: vsoxei16.v v8, (a0), v11, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i8> %idxs to <8 x i32>
%ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs
@@ -1347,11 +1347,11 @@ define void @vpscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
define void @vpscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v10, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_v8i16_v8f32:
@@ -1370,11 +1370,11 @@ define void @vpscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16
define void @vpscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_sext_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v10, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_sext_v8i16_v8f32:
@@ -1394,20 +1394,20 @@ define void @vpscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8
define void @vpscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_v8i16_v8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v12, v10
-; RV32-NEXT: vsll.vi v10, v12, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v10, a2
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_v8i16_v8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v12, v10
-; RV64-NEXT: vsll.vi v10, v12, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v10, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; RV64-NEXT: vsoxei32.v v8, (a0), v10, v0.t
+; RV64-NEXT: vsoxei32.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i16> %idxs to <8 x i32>
%ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs
@@ -1426,9 +1426,9 @@ define void @vpscatter_baseidx_v8f32(<8 x float> %val, ptr %base, <8 x i32> %idx
;
; RV64-LABEL: vpscatter_baseidx_v8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v12, v10
-; RV64-NEXT: vsll.vi v12, v12, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v12, v10, a2
; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
@@ -1557,20 +1557,20 @@ define void @vpscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8
define void @vpscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_v8i8_v8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT: vzext.vf2 v13, v12
-; RV32-NEXT: vsll.vi v12, v13, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vwmulu.vx v13, v12, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v13, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_v8i8_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; RV64-NEXT: vzext.vf2 v13, v12
-; RV64-NEXT: vsll.vi v12, v13, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vwmulu.vx v13, v12, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei16.v v8, (a0), v13, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i8> %idxs to <8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs
@@ -1581,11 +1581,11 @@ define void @vpscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8
define void @vpscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_v8i16_v8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v14, v12
-; RV32-NEXT: vsll.vi v12, v14, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v14, v12, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_v8i16_v8f64:
@@ -1604,11 +1604,11 @@ define void @vpscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i1
define void @vpscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_sext_v8i16_v8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v14, v12
-; RV32-NEXT: vsll.vi v12, v14, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulsu.vx v14, v12, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_sext_v8i16_v8f64:
@@ -1628,20 +1628,20 @@ define void @vpscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
define void @vpscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_v8i16_v8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v14, v12
-; RV32-NEXT: vsll.vi v12, v14, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v14, v12, a2
; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_v8i16_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v14, v12
-; RV64-NEXT: vsll.vi v12, v14, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v14, v12, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei32.v v8, (a0), v14, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i16> %idxs to <8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs
@@ -1660,11 +1660,11 @@ define void @vpscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr %base, <8 x i3
;
; RV64-LABEL: vpscatter_baseidx_v8i32_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v12, v16, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v12, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <8 x i32> %idxs
call void @llvm.vp.scatter.v8f64.v8p0(<8 x double> %val, <8 x ptr> %ptrs, <8 x i1> %m, i32 %evl)
@@ -1682,11 +1682,11 @@ define void @vpscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8
;
; RV64-LABEL: vpscatter_baseidx_sext_v8i32_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v12, v16, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v12, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = sext <8 x i32> %idxs to <8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs
@@ -1705,11 +1705,11 @@ define void @vpscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8
;
; RV64-LABEL: vpscatter_baseidx_zext_v8i32_v8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v12, v16, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v12, a2
; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = zext <8 x i32> %idxs to <8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs
@@ -1842,36 +1842,35 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a4, a3, 3
-; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: slli a3, a3, 4
; RV64-NEXT: sub sp, sp, a3
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
-; RV64-NEXT: addi a3, sp, 16
-; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV64-NEXT: csrr a3, vlenb
+; RV64-NEXT: slli a3, a3, 3
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 16
; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT: addi a3, sp, 16
+; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; RV64-NEXT: li a3, 32
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV64-NEXT: vle32.v v24, (a1)
+; RV64-NEXT: li a1, 8
; RV64-NEXT: li a3, 16
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; RV64-NEXT: vslidedown.vi v16, v24, 16
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v0, v24
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsll.vi v24, v0, 3
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v8, v16, a1
+; RV64-NEXT: vwmulsu.vx v16, v24, a1
; RV64-NEXT: mv a1, a2
; RV64-NEXT: bltu a2, a3, .LBB84_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a1, 16
; RV64-NEXT: .LBB84_2:
; RV64-NEXT: addi a3, sp, 16
-; RV64-NEXT: vl1r.v v0, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t
; RV64-NEXT: addi a1, a2, -16
; RV64-NEXT: sltu a2, a2, a1
; RV64-NEXT: addi a2, a2, -1
@@ -1879,14 +1878,14 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
; RV64-NEXT: vslidedown.vi v0, v0, 2
; RV64-NEXT: and a1, a2, a1
; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a1, a0, 3
-; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: slli a0, a0, 4
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: .cfi_def_cfa sp, 16
; RV64-NEXT: addi sp, sp, 16
@@ -1942,24 +1941,22 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
; RV64-NEXT: li a3, 32
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV64-NEXT: vle32.v v24, (a1)
+; RV64-NEXT: li a1, 8
; RV64-NEXT: li a3, 16
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v24
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v24, 16
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vsll.vi v24, v16, 3
+; RV64-NEXT: vslidedown.vi v16, v24, 16
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v8, v16, a1
+; RV64-NEXT: vwmulsu.vx v16, v24, a1
; RV64-NEXT: mv a1, a2
; RV64-NEXT: bltu a2, a3, .LBB85_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a1, 16
; RV64-NEXT: .LBB85_2:
; RV64-NEXT: addi a3, sp, 16
-; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsoxei64.v v16, (a0), v24, v0.t
+; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t
; RV64-NEXT: addi a1, a2, -16
; RV64-NEXT: sltu a2, a2, a1
; RV64-NEXT: addi a2, a2, -1
@@ -2031,24 +2028,22 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
; RV64-NEXT: li a3, 32
; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; RV64-NEXT: vle32.v v24, (a1)
+; RV64-NEXT: li a1, 8
; RV64-NEXT: li a3, 16
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v16, v24
; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v24, 16
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vsll.vi v24, v16, 3
+; RV64-NEXT: vslidedown.vi v16, v24, 16
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v8, v16, a1
+; RV64-NEXT: vwmulu.vx v16, v24, a1
; RV64-NEXT: mv a1, a2
; RV64-NEXT: bltu a2, a3, .LBB86_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a1, 16
; RV64-NEXT: .LBB86_2:
; RV64-NEXT: addi a3, sp, 16
-; RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload
; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsoxei64.v v16, (a0), v24, v0.t
+; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t
; RV64-NEXT: addi a1, a2, -16
; RV64-NEXT: sltu a2, a2, a1
; RV64-NEXT: addi a2, a2, -1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
index fce2284..7b12a56 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
@@ -225,9 +225,10 @@ define <4 x i64> @vwsll_vx_i8_v4i64_zext(<4 x i32> %a, i8 %b) {
define <4 x i64> @vwsll_vi_v4i64(<4 x i32> %a) {
; CHECK-LABEL: vwsll_vi_v4i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a0
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vwsll_vi_v4i64:
@@ -434,9 +435,10 @@ define <8 x i32> @vwsll_vx_i8_v8i32_zext(<8 x i16> %a, i8 %b) {
define <8 x i32> @vwsll_vi_v8i32(<8 x i16> %a) {
; CHECK-LABEL: vwsll_vi_v8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a0
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vwsll_vi_v8i32:
@@ -654,9 +656,10 @@ define <16 x i16> @vwsll_vx_i8_v16i16_zext(<16 x i8> %a, i8 %b) {
define <16 x i16> @vwsll_vi_v16i16(<16 x i8> %a) {
; CHECK-LABEL: vwsll_vi_v16i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a0
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vwsll_vi_v16i16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index 9ee2324..0fad09f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -775,11 +775,11 @@ define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i8_nxv8i32(ptr %base, <vscal
define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i8_nxv8i32(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT: vluxei16.v v12, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v12, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
@@ -791,10 +791,11 @@ define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i8_nxv8i32(ptr %base, <vscal
define <vscale x 8 x i32> @mgather_baseidx_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -815,10 +816,11 @@ define <vscale x 8 x i32> @mgather_baseidx_nxv8i16_nxv8i32(ptr %base, <vscale x
define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -840,10 +842,11 @@ define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i16_nxv8i32(ptr %base, <vsca
define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; CHECK-NEXT: vzext.vf2 v16, v8
-; CHECK-NEXT: vsll.vi v8, v16, 2
-; CHECK-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v16, v8, a1
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vluxei32.v v12, (a0), v16, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
@@ -863,10 +866,9 @@ define <vscale x 8 x i32> @mgather_baseidx_nxv8i32(ptr %base, <vscale x 8 x i32>
;
; RV64-LABEL: mgather_baseidx_nxv8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64-NEXT: li a1, 4
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, mu
+; RV64-NEXT: vwmulsu.vx v16, v8, a1
; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
@@ -1034,11 +1036,11 @@ define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i8_nxv8i64(ptr %base, <vscal
define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i8_nxv8i64(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei16.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v16, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
@@ -1050,11 +1052,11 @@ define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i8_nxv8i64(ptr %base, <vscal
define <vscale x 8 x i64> @mgather_baseidx_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -1074,11 +1076,11 @@ define <vscale x 8 x i64> @mgather_baseidx_nxv8i16_nxv8i64(ptr %base, <vscale x
define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -1099,11 +1101,11 @@ define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i16_nxv8i64(ptr %base, <vsca
define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v12, v8
-; CHECK-NEXT: vsll.vi v8, v12, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v12, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei32.v v16, (a0), v12, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
@@ -1124,10 +1126,11 @@ define <vscale x 8 x i64> @mgather_baseidx_nxv8i32_nxv8i64(ptr %base, <vscale x
;
; RV64-LABEL: mgather_baseidx_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i32> %idxs
@@ -1147,10 +1150,11 @@ define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i32_nxv8i64(ptr %base, <vsca
;
; RV64-LABEL: mgather_baseidx_sext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
@@ -1171,10 +1175,11 @@ define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i32_nxv8i64(ptr %base, <vsca
;
; RV64-LABEL: mgather_baseidx_zext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vzext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
@@ -1845,11 +1850,11 @@ define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i8_nxv8f32(ptr %base, <vsc
define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i8_nxv8f32(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT: vluxei16.v v12, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v12, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
@@ -1861,10 +1866,11 @@ define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i8_nxv8f32(ptr %base, <vsc
define <vscale x 8 x float> @mgather_baseidx_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -1885,10 +1891,11 @@ define <vscale x 8 x float> @mgather_baseidx_nxv8i16_nxv8f32(ptr %base, <vscale
define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -1910,10 +1917,11 @@ define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i16_nxv8f32(ptr %base, <vs
define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; CHECK-NEXT: vzext.vf2 v16, v8
-; CHECK-NEXT: vsll.vi v8, v16, 2
-; CHECK-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v16, v8, a1
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vluxei32.v v12, (a0), v16, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
@@ -1933,10 +1941,9 @@ define <vscale x 8 x float> @mgather_baseidx_nxv8f32(ptr %base, <vscale x 8 x i3
;
; RV64-LABEL: mgather_baseidx_nxv8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64-NEXT: li a1, 4
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, mu
+; RV64-NEXT: vwmulsu.vx v16, v8, a1
; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
@@ -2104,11 +2111,11 @@ define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i8_nxv8f64(ptr %base, <vs
define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i8_nxv8f64(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei16.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v16, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
@@ -2120,11 +2127,11 @@ define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i8_nxv8f64(ptr %base, <vs
define <vscale x 8 x double> @mgather_baseidx_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -2144,11 +2151,11 @@ define <vscale x 8 x double> @mgather_baseidx_nxv8i16_nxv8f64(ptr %base, <vscale
define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -2169,11 +2176,11 @@ define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i16_nxv8f64(ptr %base, <v
define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v12, v8
-; CHECK-NEXT: vsll.vi v8, v12, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v12, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei32.v v16, (a0), v12, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
@@ -2194,10 +2201,11 @@ define <vscale x 8 x double> @mgather_baseidx_nxv8i32_nxv8f64(ptr %base, <vscale
;
; RV64-LABEL: mgather_baseidx_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i32> %idxs
@@ -2217,10 +2225,11 @@ define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i32_nxv8f64(ptr %base, <v
;
; RV64-LABEL: mgather_baseidx_sext_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
@@ -2241,10 +2250,11 @@ define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i32_nxv8f64(ptr %base, <v
;
; RV64-LABEL: mgather_baseidx_zext_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vzext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
index 77a1f50..3cf7cc9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
@@ -581,11 +581,11 @@ define void @mscatter_baseidx_sext_nxv8i8_nxv8i32(<vscale x 8 x i32> %val, ptr %
define void @mscatter_baseidx_zext_nxv8i8_nxv8i32(<vscale x 8 x i32> %val, ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m) {
; CHECK-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v14, v12
-; CHECK-NEXT: vsll.vi v12, v14, 2
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v14, v12, a1
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT: vsoxei16.v v8, (a0), v14, v0.t
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
%ptrs = getelementptr inbounds i32, ptr %base, <vscale x 8 x i32> %eidxs
@@ -596,10 +596,11 @@ define void @mscatter_baseidx_zext_nxv8i8_nxv8i32(<vscale x 8 x i32> %val, ptr %
define void @mscatter_baseidx_nxv8i16_nxv8i32(<vscale x 8 x i32> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v12
-; RV32-NEXT: vsll.vi v12, v16, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v12, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8i32:
@@ -618,10 +619,11 @@ define void @mscatter_baseidx_nxv8i16_nxv8i32(<vscale x 8 x i32> %val, ptr %base
define void @mscatter_baseidx_sext_nxv8i16_nxv8i32(<vscale x 8 x i32> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v12
-; RV32-NEXT: vsll.vi v12, v16, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v12, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i32:
@@ -641,10 +643,11 @@ define void @mscatter_baseidx_sext_nxv8i16_nxv8i32(<vscale x 8 x i32> %val, ptr
define void @mscatter_baseidx_zext_nxv8i16_nxv8i32(<vscale x 8 x i32> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; CHECK-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v16, v12
-; CHECK-NEXT: vsll.vi v12, v16, 2
-; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v16, v12, a1
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
%ptrs = getelementptr inbounds i32, ptr %base, <vscale x 8 x i32> %eidxs
@@ -662,10 +665,9 @@ define void @mscatter_baseidx_nxv8i32(<vscale x 8 x i32> %val, ptr %base, <vscal
;
; RV64-LABEL: mscatter_baseidx_nxv8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: li a1, 4
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v12, a1
; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i32, ptr %base, <vscale x 8 x i32> %idxs
@@ -817,11 +819,11 @@ define void @mscatter_baseidx_sext_nxv8i8_nxv8i64(<vscale x 8 x i64> %val, ptr %
define void @mscatter_baseidx_zext_nxv8i8_nxv8i64(<vscale x 8 x i64> %val, ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m) {
; CHECK-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v18, v16
-; CHECK-NEXT: vsll.vi v16, v18, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v18, v16, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT: vsoxei16.v v8, (a0), v18, v0.t
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i64> %eidxs
@@ -832,11 +834,11 @@ define void @mscatter_baseidx_zext_nxv8i8_nxv8i64(<vscale x 8 x i64> %val, ptr %
define void @mscatter_baseidx_nxv8i16_nxv8i64(<vscale x 8 x i64> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v20, v16, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8i64:
@@ -854,11 +856,11 @@ define void @mscatter_baseidx_nxv8i16_nxv8i64(<vscale x 8 x i64> %val, ptr %base
define void @mscatter_baseidx_sext_nxv8i16_nxv8i64(<vscale x 8 x i64> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v20, v16, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i64:
@@ -877,11 +879,11 @@ define void @mscatter_baseidx_sext_nxv8i16_nxv8i64(<vscale x 8 x i64> %val, ptr
define void @mscatter_baseidx_zext_nxv8i16_nxv8i64(<vscale x 8 x i64> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; CHECK-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v20, v16
-; CHECK-NEXT: vsll.vi v16, v20, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v20, v16, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i64> %eidxs
@@ -900,10 +902,11 @@ define void @mscatter_baseidx_nxv8i32_nxv8i64(<vscale x 8 x i64> %val, ptr %base
;
; RV64-LABEL: mscatter_baseidx_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v16, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i32> %idxs
call void @llvm.masked.scatter.nxv8i64.nxv8p0(<vscale x 8 x i64> %val, <vscale x 8 x ptr> %ptrs, i32 8, <vscale x 8 x i1> %m)
@@ -921,10 +924,11 @@ define void @mscatter_baseidx_sext_nxv8i32_nxv8i64(<vscale x 8 x i64> %val, ptr
;
; RV64-LABEL: mscatter_baseidx_sext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v16, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i64> %eidxs
@@ -943,10 +947,11 @@ define void @mscatter_baseidx_zext_nxv8i32_nxv8i64(<vscale x 8 x i64> %val, ptr
;
; RV64-LABEL: mscatter_baseidx_zext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v16, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i64> %eidxs
@@ -1475,11 +1480,11 @@ define void @mscatter_baseidx_sext_nxv8i8_nxv8f32(<vscale x 8 x float> %val, ptr
define void @mscatter_baseidx_zext_nxv8i8_nxv8f32(<vscale x 8 x float> %val, ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m) {
; CHECK-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v14, v12
-; CHECK-NEXT: vsll.vi v12, v14, 2
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v14, v12, a1
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT: vsoxei16.v v8, (a0), v14, v0.t
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
%ptrs = getelementptr inbounds float, ptr %base, <vscale x 8 x i32> %eidxs
@@ -1490,10 +1495,11 @@ define void @mscatter_baseidx_zext_nxv8i8_nxv8f32(<vscale x 8 x float> %val, ptr
define void @mscatter_baseidx_nxv8i16_nxv8f32(<vscale x 8 x float> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v12
-; RV32-NEXT: vsll.vi v12, v16, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v12, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8f32:
@@ -1512,10 +1518,11 @@ define void @mscatter_baseidx_nxv8i16_nxv8f32(<vscale x 8 x float> %val, ptr %ba
define void @mscatter_baseidx_sext_nxv8i16_nxv8f32(<vscale x 8 x float> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v12
-; RV32-NEXT: vsll.vi v12, v16, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v12, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f32:
@@ -1535,10 +1542,11 @@ define void @mscatter_baseidx_sext_nxv8i16_nxv8f32(<vscale x 8 x float> %val, pt
define void @mscatter_baseidx_zext_nxv8i16_nxv8f32(<vscale x 8 x float> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; CHECK-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v16, v12
-; CHECK-NEXT: vsll.vi v12, v16, 2
-; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v16, v12, a1
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
%ptrs = getelementptr inbounds float, ptr %base, <vscale x 8 x i32> %eidxs
@@ -1556,10 +1564,9 @@ define void @mscatter_baseidx_nxv8f32(<vscale x 8 x float> %val, ptr %base, <vsc
;
; RV64-LABEL: mscatter_baseidx_nxv8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: li a1, 4
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v12, a1
; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds float, ptr %base, <vscale x 8 x i32> %idxs
@@ -1711,11 +1718,11 @@ define void @mscatter_baseidx_sext_nxv8i8_nxv8f64(<vscale x 8 x double> %val, pt
define void @mscatter_baseidx_zext_nxv8i8_nxv8f64(<vscale x 8 x double> %val, ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m) {
; CHECK-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v18, v16
-; CHECK-NEXT: vsll.vi v16, v18, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v18, v16, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT: vsoxei16.v v8, (a0), v18, v0.t
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i64> %eidxs
@@ -1726,11 +1733,11 @@ define void @mscatter_baseidx_zext_nxv8i8_nxv8f64(<vscale x 8 x double> %val, pt
define void @mscatter_baseidx_nxv8i16_nxv8f64(<vscale x 8 x double> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v20, v16, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8f64:
@@ -1748,11 +1755,11 @@ define void @mscatter_baseidx_nxv8i16_nxv8f64(<vscale x 8 x double> %val, ptr %b
define void @mscatter_baseidx_sext_nxv8i16_nxv8f64(<vscale x 8 x double> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v20, v16, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f64:
@@ -1771,11 +1778,11 @@ define void @mscatter_baseidx_sext_nxv8i16_nxv8f64(<vscale x 8 x double> %val, p
define void @mscatter_baseidx_zext_nxv8i16_nxv8f64(<vscale x 8 x double> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m) {
; CHECK-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v20, v16
-; CHECK-NEXT: vsll.vi v16, v20, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v20, v16, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i64> %eidxs
@@ -1794,10 +1801,11 @@ define void @mscatter_baseidx_nxv8i32_nxv8f64(<vscale x 8 x double> %val, ptr %b
;
; RV64-LABEL: mscatter_baseidx_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v16, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i32> %idxs
call void @llvm.masked.scatter.nxv8f64.nxv8p0(<vscale x 8 x double> %val, <vscale x 8 x ptr> %ptrs, i32 8, <vscale x 8 x i1> %m)
@@ -1815,10 +1823,11 @@ define void @mscatter_baseidx_sext_nxv8i32_nxv8f64(<vscale x 8 x double> %val, p
;
; RV64-LABEL: mscatter_baseidx_sext_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v16, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i64> %eidxs
@@ -1837,10 +1846,11 @@ define void @mscatter_baseidx_zext_nxv8i32_nxv8f64(<vscale x 8 x double> %val, p
;
; RV64-LABEL: mscatter_baseidx_zext_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v16, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i64> %eidxs
@@ -2015,15 +2025,15 @@ define void @mscatter_baseidx_nxv16i16_nxv16f64(<vscale x 8 x double> %val0, <vs
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: srli a1, a1, 3
; RV32-NEXT: vslidedown.vx v7, v0, a1
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vsext.vf2 v24, v8
-; RV32-NEXT: vsll.vi v8, v24, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; RV32-NEXT: vwmulsu.vx v24, v8, a1
; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vsoxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: addi a1, sp, 16
-; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT: vsoxei32.v v16, (a0), v12, v0.t
+; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
index abe7bda..1007d1c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
@@ -799,18 +799,18 @@ define <vscale x 8 x i32> @vpgather_baseidx_sext_nxv8i8_nxv8i32(ptr %base, <vsca
define <vscale x 8 x i32> @vpgather_baseidx_zext_nxv8i8_nxv8i32(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv8i8_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v12, v10, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v8, a2
; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; RV32-NEXT: vluxei16.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv8i8_nxv8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v10, v8
-; RV64-NEXT: vsll.vi v12, v10, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; RV64-NEXT: vluxei16.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
@@ -823,10 +823,11 @@ define <vscale x 8 x i32> @vpgather_baseidx_zext_nxv8i8_nxv8i32(ptr %base, <vsca
define <vscale x 8 x i32> @vpgather_baseidx_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_nxv8i16_nxv8i32:
@@ -845,10 +846,11 @@ define <vscale x 8 x i32> @vpgather_baseidx_nxv8i16_nxv8i32(ptr %base, <vscale x
define <vscale x 8 x i32> @vpgather_baseidx_sext_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_sext_nxv8i16_nxv8i32:
@@ -868,18 +870,20 @@ define <vscale x 8 x i32> @vpgather_baseidx_sext_nxv8i16_nxv8i32(ptr %base, <vsc
define <vscale x 8 x i32> @vpgather_baseidx_zext_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v8, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv8i16_nxv8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v8, v12, 2
-; RV64-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
%ptrs = getelementptr inbounds i32, ptr %base, <vscale x 8 x i32> %eidxs
@@ -897,10 +901,9 @@ define <vscale x 8 x i32> @vpgather_baseidx_nxv8i32(ptr %base, <vscale x 8 x i32
;
; RV64-LABEL: vpgather_baseidx_nxv8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v8, a2
; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i32, ptr %base, <vscale x 8 x i32> %idxs
@@ -1049,18 +1052,18 @@ define <vscale x 8 x i64> @vpgather_baseidx_sext_nxv8i8_nxv8i64(ptr %base, <vsca
define <vscale x 8 x i64> @vpgather_baseidx_zext_nxv8i8_nxv8i64(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv8i8_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v16, v10, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei16.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv8i8_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v10, v8
-; RV64-NEXT: vsll.vi v16, v10, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vluxei16.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -1073,9 +1076,9 @@ define <vscale x 8 x i64> @vpgather_baseidx_zext_nxv8i8_nxv8i64(ptr %base, <vsca
define <vscale x 8 x i64> @vpgather_baseidx_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v16, v12, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
@@ -1095,9 +1098,9 @@ define <vscale x 8 x i64> @vpgather_baseidx_nxv8i16_nxv8i64(ptr %base, <vscale x
define <vscale x 8 x i64> @vpgather_baseidx_sext_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v16, v12, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
@@ -1118,18 +1121,18 @@ define <vscale x 8 x i64> @vpgather_baseidx_sext_nxv8i16_nxv8i64(ptr %base, <vsc
define <vscale x 8 x i64> @vpgather_baseidx_zext_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v16, v12, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv8i16_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v16, v12, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -1150,10 +1153,11 @@ define <vscale x 8 x i64> @vpgather_baseidx_nxv8i32_nxv8i64(ptr %base, <vscale x
;
; RV64-LABEL: vpgather_baseidx_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v8, v16, 3
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i32> %idxs
%v = call <vscale x 8 x i64> @llvm.vp.gather.nxv8i64.nxv8p0(<vscale x 8 x ptr> %ptrs, <vscale x 8 x i1> %m, i32 %evl)
@@ -1171,10 +1175,11 @@ define <vscale x 8 x i64> @vpgather_baseidx_sext_nxv8i32_nxv8i64(ptr %base, <vsc
;
; RV64-LABEL: vpgather_baseidx_sext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v8, v16, 3
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i64> %eidxs
@@ -1193,10 +1198,11 @@ define <vscale x 8 x i64> @vpgather_baseidx_zext_nxv8i32_nxv8i64(ptr %base, <vsc
;
; RV64-LABEL: vpgather_baseidx_zext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v8, v16, 3
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i64> %eidxs
@@ -1742,18 +1748,18 @@ define <vscale x 8 x float> @vpgather_baseidx_sext_nxv8i8_nxv8f32(ptr %base, <vs
define <vscale x 8 x float> @vpgather_baseidx_zext_nxv8i8_nxv8f32(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv8i8_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v12, v10, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v8, a2
; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; RV32-NEXT: vluxei16.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv8i8_nxv8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v10, v8
-; RV64-NEXT: vsll.vi v12, v10, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v8, a2
; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; RV64-NEXT: vluxei16.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
@@ -1766,10 +1772,11 @@ define <vscale x 8 x float> @vpgather_baseidx_zext_nxv8i8_nxv8f32(ptr %base, <vs
define <vscale x 8 x float> @vpgather_baseidx_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_nxv8i16_nxv8f32:
@@ -1788,10 +1795,11 @@ define <vscale x 8 x float> @vpgather_baseidx_nxv8i16_nxv8f32(ptr %base, <vscale
define <vscale x 8 x float> @vpgather_baseidx_sext_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_sext_nxv8i16_nxv8f32:
@@ -1811,18 +1819,20 @@ define <vscale x 8 x float> @vpgather_baseidx_sext_nxv8i16_nxv8f32(ptr %base, <v
define <vscale x 8 x float> @vpgather_baseidx_zext_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 2
-; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v12, v8, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv8i16_nxv8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v8, v12, 2
-; RV64-NEXT: vluxei32.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v12, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: vluxei32.v v8, (a0), v12, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
%ptrs = getelementptr inbounds float, ptr %base, <vscale x 8 x i32> %eidxs
@@ -1840,10 +1850,9 @@ define <vscale x 8 x float> @vpgather_baseidx_nxv8f32(ptr %base, <vscale x 8 x i
;
; RV64-LABEL: vpgather_baseidx_nxv8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v8, a2
; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds float, ptr %base, <vscale x 8 x i32> %idxs
@@ -1992,18 +2001,18 @@ define <vscale x 6 x double> @vpgather_baseidx_sext_nxv6i8_nxv6f64(ptr %base, <v
define <vscale x 6 x double> @vpgather_baseidx_zext_nxv6i8_nxv6f64(ptr %base, <vscale x 6 x i8> %idxs, <vscale x 6 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv6i8_nxv6f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v16, v10, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei16.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv6i8_nxv6f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v10, v8
-; RV64-NEXT: vsll.vi v16, v10, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vluxei16.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2016,9 +2025,9 @@ define <vscale x 6 x double> @vpgather_baseidx_zext_nxv6i8_nxv6f64(ptr %base, <v
define <vscale x 6 x double> @vpgather_baseidx_nxv6i16_nxv6f64(ptr %base, <vscale x 6 x i16> %idxs, <vscale x 6 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_nxv6i16_nxv6f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v16, v12, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
@@ -2038,9 +2047,9 @@ define <vscale x 6 x double> @vpgather_baseidx_nxv6i16_nxv6f64(ptr %base, <vscal
define <vscale x 6 x double> @vpgather_baseidx_sext_nxv6i16_nxv6f64(ptr %base, <vscale x 6 x i16> %idxs, <vscale x 6 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_nxv6i16_nxv6f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v16, v12, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
@@ -2061,18 +2070,18 @@ define <vscale x 6 x double> @vpgather_baseidx_sext_nxv6i16_nxv6f64(ptr %base, <
define <vscale x 6 x double> @vpgather_baseidx_zext_nxv6i16_nxv6f64(ptr %base, <vscale x 6 x i16> %idxs, <vscale x 6 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv6i16_nxv6f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v16, v12, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv6i16_nxv6f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v16, v12, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2093,10 +2102,11 @@ define <vscale x 6 x double> @vpgather_baseidx_nxv6i32_nxv6f64(ptr %base, <vscal
;
; RV64-LABEL: vpgather_baseidx_nxv6i32_nxv6f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v8, v16, 3
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 6 x i32> %idxs
%v = call <vscale x 6 x double> @llvm.vp.gather.nxv6f64.nxv6p0(<vscale x 6 x ptr> %ptrs, <vscale x 6 x i1> %m, i32 %evl)
@@ -2114,10 +2124,11 @@ define <vscale x 6 x double> @vpgather_baseidx_sext_nxv6i32_nxv6f64(ptr %base, <
;
; RV64-LABEL: vpgather_baseidx_sext_nxv6i32_nxv6f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v8, v16, 3
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = sext <vscale x 6 x i32> %idxs to <vscale x 6 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 6 x i64> %eidxs
@@ -2136,10 +2147,11 @@ define <vscale x 6 x double> @vpgather_baseidx_zext_nxv6i32_nxv6f64(ptr %base, <
;
; RV64-LABEL: vpgather_baseidx_zext_nxv6i32_nxv6f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v8, v16, 3
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 6 x i32> %idxs to <vscale x 6 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 6 x i64> %eidxs
@@ -2235,18 +2247,18 @@ define <vscale x 8 x double> @vpgather_baseidx_sext_nxv8i8_nxv8f64(ptr %base, <v
define <vscale x 8 x double> @vpgather_baseidx_zext_nxv8i8_nxv8f64(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv8i8_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v10, v8
-; RV32-NEXT: vsll.vi v16, v10, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei16.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv8i8_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v10, v8
-; RV64-NEXT: vsll.vi v16, v10, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vluxei16.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2259,9 +2271,9 @@ define <vscale x 8 x double> @vpgather_baseidx_zext_nxv8i8_nxv8f64(ptr %base, <v
define <vscale x 8 x double> @vpgather_baseidx_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v16, v12, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
@@ -2281,9 +2293,9 @@ define <vscale x 8 x double> @vpgather_baseidx_nxv8i16_nxv8f64(ptr %base, <vscal
define <vscale x 8 x double> @vpgather_baseidx_sext_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v16, v12, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
@@ -2304,18 +2316,18 @@ define <vscale x 8 x double> @vpgather_baseidx_sext_nxv8i16_nxv8f64(ptr %base, <
define <vscale x 8 x double> @vpgather_baseidx_zext_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v16, v12, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v16, v8, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv8i16_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v12, v8
-; RV64-NEXT: vsll.vi v16, v12, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
@@ -2336,10 +2348,11 @@ define <vscale x 8 x double> @vpgather_baseidx_nxv8i32_nxv8f64(ptr %base, <vscal
;
; RV64-LABEL: vpgather_baseidx_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v8, v16, 3
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i32> %idxs
%v = call <vscale x 8 x double> @llvm.vp.gather.nxv8f64.nxv8p0(<vscale x 8 x ptr> %ptrs, <vscale x 8 x i1> %m, i32 %evl)
@@ -2357,10 +2370,11 @@ define <vscale x 8 x double> @vpgather_baseidx_sext_nxv8i32_nxv8f64(ptr %base, <
;
; RV64-LABEL: vpgather_baseidx_sext_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v8, v16, 3
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i64> %eidxs
@@ -2379,10 +2393,11 @@ define <vscale x 8 x double> @vpgather_baseidx_zext_nxv8i32_nxv8f64(ptr %base, <
;
; RV64-LABEL: vpgather_baseidx_zext_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v8, v16, 3
-; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v8, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i64> %eidxs
@@ -2465,27 +2480,26 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_nxv16i16_nxv16f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv1r.v v12, v0
-; RV32-NEXT: vsext.vf2 v16, v8
+; RV32-NEXT: li a3, 8
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vsll.vi v24, v16, 3
-; RV32-NEXT: sub a3, a1, a2
-; RV32-NEXT: srli a4, a2, 3
-; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vx v0, v0, a4
-; RV32-NEXT: sltu a4, a1, a3
-; RV32-NEXT: addi a4, a4, -1
-; RV32-NEXT: and a3, a4, a3
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; RV32-NEXT: vwmulsu.vx v24, v8, a3
+; RV32-NEXT: mv a3, a1
; RV32-NEXT: bltu a1, a2, .LBB112_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: mv a1, a2
+; RV32-NEXT: mv a3, a2
; RV32-NEXT: .LBB112_2:
-; RV32-NEXT: vmv1r.v v0, v12
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: sub a3, a1, a2
+; RV32-NEXT: srli a2, a2, 3
+; RV32-NEXT: sltu a1, a1, a3
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vx v0, v0, a2
+; RV32-NEXT: and a1, a1, a3
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_nxv16i16_nxv16f64:
@@ -2523,27 +2537,26 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_sext_nxv16i16_nxv16f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv1r.v v12, v0
-; RV32-NEXT: vsext.vf2 v16, v8
+; RV32-NEXT: li a3, 8
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vsll.vi v24, v16, 3
-; RV32-NEXT: sub a3, a1, a2
-; RV32-NEXT: srli a4, a2, 3
-; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vx v0, v0, a4
-; RV32-NEXT: sltu a4, a1, a3
-; RV32-NEXT: addi a4, a4, -1
-; RV32-NEXT: and a3, a4, a3
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; RV32-NEXT: vwmulsu.vx v24, v8, a3
+; RV32-NEXT: mv a3, a1
; RV32-NEXT: bltu a1, a2, .LBB113_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: mv a1, a2
+; RV32-NEXT: mv a3, a2
; RV32-NEXT: .LBB113_2:
-; RV32-NEXT: vmv1r.v v0, v12
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: sub a3, a1, a2
+; RV32-NEXT: srli a2, a2, 3
+; RV32-NEXT: sltu a1, a1, a3
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vx v0, v0, a2
+; RV32-NEXT: and a1, a1, a3
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_sext_nxv16i16_nxv16f64:
@@ -2582,52 +2595,50 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT: vmv1r.v v12, v0
-; RV32-NEXT: vzext.vf2 v16, v8
+; RV32-NEXT: li a3, 8
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vsll.vi v24, v16, 3
-; RV32-NEXT: sub a3, a1, a2
-; RV32-NEXT: srli a4, a2, 3
-; RV32-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vx v0, v0, a4
-; RV32-NEXT: sltu a4, a1, a3
-; RV32-NEXT: addi a4, a4, -1
-; RV32-NEXT: and a3, a4, a3
-; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; RV32-NEXT: vwmulu.vx v24, v8, a3
+; RV32-NEXT: mv a3, a1
; RV32-NEXT: bltu a1, a2, .LBB114_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: mv a1, a2
+; RV32-NEXT: mv a3, a2
; RV32-NEXT: .LBB114_2:
-; RV32-NEXT: vmv1r.v v0, v12
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: sub a3, a1, a2
+; RV32-NEXT: srli a2, a2, 3
+; RV32-NEXT: sltu a1, a1, a3
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vx v0, v0, a2
+; RV32-NEXT: and a1, a1, a3
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a2, zero, e32, m8, ta, ma
-; RV64-NEXT: vmv1r.v v12, v0
-; RV64-NEXT: vzext.vf2 v16, v8
+; RV64-NEXT: li a3, 8
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vsll.vi v24, v16, 3
-; RV64-NEXT: sub a3, a1, a2
-; RV64-NEXT: srli a4, a2, 3
-; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vx v0, v0, a4
-; RV64-NEXT: sltu a4, a1, a3
-; RV64-NEXT: addi a4, a4, -1
-; RV64-NEXT: and a3, a4, a3
-; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
-; RV64-NEXT: vluxei32.v v16, (a0), v28, v0.t
+; RV64-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v8, a3
+; RV64-NEXT: mv a3, a1
; RV64-NEXT: bltu a1, a2, .LBB114_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: mv a1, a2
+; RV64-NEXT: mv a3, a2
; RV64-NEXT: .LBB114_2:
-; RV64-NEXT: vmv1r.v v0, v12
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV64-NEXT: sub a3, a1, a2
+; RV64-NEXT: srli a2, a2, 3
+; RV64-NEXT: sltu a1, a1, a3
+; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vx v0, v0, a2
+; RV64-NEXT: and a1, a1, a3
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vluxei32.v v16, (a0), v28, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 16 x i16> %idxs to <vscale x 16 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 16 x i64> %eidxs
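; Illustrative sketch (not part of the test file above; function name is hypothetical):
; the gather updates in this file all exercise one pattern. The index vector is sign-
; or zero-extended and then scaled by the element size (8 bytes for double, i.e. a
; shift by 3), and the updated lowering is expected to fold the extend+shift into a
; single widening multiply by 8 at the narrower SEW (vwmulu.vx for zext, vwmulsu.vx
; for sext) instead of vzext.vf2/vsext.vf2 followed by vsll.vi at the wide SEW.
declare <vscale x 4 x double> @llvm.vp.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr>, <vscale x 4 x i1>, i32)

define <vscale x 4 x double> @example_vpgather_zext_i16_f64(ptr %base, <vscale x 4 x i16> %idxs, <vscale x 4 x i1> %m, i32 zeroext %evl) {
  ; Widen the i16 indices with zero extension and scale by sizeof(double) = 8.
  %eidxs = zext <vscale x 4 x i16> %idxs to <vscale x 4 x i64>
  %ptrs = getelementptr inbounds double, ptr %base, <vscale x 4 x i64> %eidxs
  %v = call <vscale x 4 x double> @llvm.vp.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> %ptrs, <vscale x 4 x i1> %m, i32 %evl)
  ret <vscale x 4 x double> %v
}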
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
index 647e396..2cf6248 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
@@ -659,20 +659,20 @@ define void @vpscatter_baseidx_sext_nxv8i8_nxv8i32(<vscale x 8 x i32> %val, ptr
define void @vpscatter_baseidx_zext_nxv8i8_nxv8i32(<vscale x 8 x i32> %val, ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv8i8_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v14, v12
-; RV32-NEXT: vsll.vi v12, v14, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v14, v12, a2
; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v14, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv8i8_nxv8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v14, v12
-; RV64-NEXT: vsll.vi v12, v14, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v14, v12, a2
; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; RV64-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei16.v v8, (a0), v14, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
%ptrs = getelementptr inbounds i32, ptr %base, <vscale x 8 x i32> %eidxs
@@ -683,10 +683,11 @@ define void @vpscatter_baseidx_zext_nxv8i8_nxv8i32(<vscale x 8 x i32> %val, ptr
define void @vpscatter_baseidx_nxv8i16_nxv8i32(<vscale x 8 x i32> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v12
-; RV32-NEXT: vsll.vi v12, v16, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v12, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_nxv8i16_nxv8i32:
@@ -705,10 +706,11 @@ define void @vpscatter_baseidx_nxv8i16_nxv8i32(<vscale x 8 x i32> %val, ptr %bas
define void @vpscatter_baseidx_sext_nxv8i16_nxv8i32(<vscale x 8 x i32> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_sext_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v12
-; RV32-NEXT: vsll.vi v12, v16, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v12, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_sext_nxv8i16_nxv8i32:
@@ -728,18 +730,20 @@ define void @vpscatter_baseidx_sext_nxv8i16_nxv8i32(<vscale x 8 x i32> %val, ptr
define void @vpscatter_baseidx_zext_nxv8i16_nxv8i32(<vscale x 8 x i32> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v16, v12
-; RV32-NEXT: vsll.vi v12, v16, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v16, v12, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv8i16_nxv8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v12, v16, 2
-; RV64-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v12, a2
+; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
%ptrs = getelementptr inbounds i32, ptr %base, <vscale x 8 x i32> %eidxs
@@ -757,10 +761,9 @@ define void @vpscatter_baseidx_nxv8i32(<vscale x 8 x i32> %val, ptr %base, <vsca
;
; RV64-LABEL: vpscatter_baseidx_nxv8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v12, a2
; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i32, ptr %base, <vscale x 8 x i32> %idxs
@@ -904,20 +907,20 @@ define void @vpscatter_baseidx_sext_nxv8i8_nxv8i64(<vscale x 8 x i64> %val, ptr
define void @vpscatter_baseidx_zext_nxv8i8_nxv8i64(<vscale x 8 x i64> %val, ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv8i8_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v18, v16
-; RV32-NEXT: vsll.vi v16, v18, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v18, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v18, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv8i8_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v18, v16
-; RV64-NEXT: vsll.vi v16, v18, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v18, v16, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT: vsoxei16.v v8, (a0), v16, v0.t
+; RV64-NEXT: vsoxei16.v v8, (a0), v18, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i64> %eidxs
@@ -928,11 +931,11 @@ define void @vpscatter_baseidx_zext_nxv8i8_nxv8i64(<vscale x 8 x i64> %val, ptr
define void @vpscatter_baseidx_nxv8i16_nxv8i64(<vscale x 8 x i64> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v20, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_nxv8i16_nxv8i64:
@@ -950,11 +953,11 @@ define void @vpscatter_baseidx_nxv8i16_nxv8i64(<vscale x 8 x i64> %val, ptr %bas
define void @vpscatter_baseidx_sext_nxv8i16_nxv8i64(<vscale x 8 x i64> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_sext_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v20, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_sext_nxv8i16_nxv8i64:
@@ -973,20 +976,20 @@ define void @vpscatter_baseidx_sext_nxv8i16_nxv8i64(<vscale x 8 x i64> %val, ptr
define void @vpscatter_baseidx_zext_nxv8i16_nxv8i64(<vscale x 8 x i64> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v20, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv8i16_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v20, v16
-; RV64-NEXT: vsll.vi v16, v20, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v20, v16, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV64-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i64> %eidxs
@@ -1005,10 +1008,11 @@ define void @vpscatter_baseidx_nxv8i32_nxv8i64(<vscale x 8 x i64> %val, ptr %bas
;
; RV64-LABEL: vpscatter_baseidx_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v16, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i32> %idxs
call void @llvm.vp.scatter.nxv8i64.nxv8p0(<vscale x 8 x i64> %val, <vscale x 8 x ptr> %ptrs, <vscale x 8 x i1> %m, i32 %evl)
@@ -1026,10 +1030,11 @@ define void @vpscatter_baseidx_sext_nxv8i32_nxv8i64(<vscale x 8 x i64> %val, ptr
;
; RV64-LABEL: vpscatter_baseidx_sext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v16, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i64> %eidxs
@@ -1048,10 +1053,11 @@ define void @vpscatter_baseidx_zext_nxv8i32_nxv8i64(<vscale x 8 x i64> %val, ptr
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v16, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i64> %eidxs
@@ -1572,20 +1578,20 @@ define void @vpscatter_baseidx_sext_nxv8i8_nxv8f32(<vscale x 8 x float> %val, pt
define void @vpscatter_baseidx_zext_nxv8i8_nxv8f32(<vscale x 8 x float> %val, ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv8i8_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v14, v12
-; RV32-NEXT: vsll.vi v12, v14, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v14, v12, a2
; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v14, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv8i8_nxv8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v14, v12
-; RV64-NEXT: vsll.vi v12, v14, 2
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v14, v12, a2
; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
-; RV64-NEXT: vsoxei16.v v8, (a0), v12, v0.t
+; RV64-NEXT: vsoxei16.v v8, (a0), v14, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
%ptrs = getelementptr inbounds float, ptr %base, <vscale x 8 x i32> %eidxs
@@ -1596,10 +1602,11 @@ define void @vpscatter_baseidx_zext_nxv8i8_nxv8f32(<vscale x 8 x float> %val, pt
define void @vpscatter_baseidx_nxv8i16_nxv8f32(<vscale x 8 x float> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v12
-; RV32-NEXT: vsll.vi v12, v16, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v12, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_nxv8i16_nxv8f32:
@@ -1618,10 +1625,11 @@ define void @vpscatter_baseidx_nxv8i16_nxv8f32(<vscale x 8 x float> %val, ptr %b
define void @vpscatter_baseidx_sext_nxv8i16_nxv8f32(<vscale x 8 x float> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_sext_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v12
-; RV32-NEXT: vsll.vi v12, v16, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v12, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_sext_nxv8i16_nxv8f32:
@@ -1641,18 +1649,20 @@ define void @vpscatter_baseidx_sext_nxv8i16_nxv8f32(<vscale x 8 x float> %val, p
define void @vpscatter_baseidx_zext_nxv8i16_nxv8f32(<vscale x 8 x float> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v16, v12
-; RV32-NEXT: vsll.vi v12, v16, 2
-; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v16, v12, a2
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv8i16_nxv8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v12, v16, 2
-; RV64-NEXT: vsoxei32.v v8, (a0), v12, v0.t
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v16, v12, a2
+; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: vsoxei32.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
%ptrs = getelementptr inbounds float, ptr %base, <vscale x 8 x i32> %eidxs
@@ -1670,10 +1680,9 @@ define void @vpscatter_baseidx_nxv8f32(<vscale x 8 x float> %val, ptr %base, <vs
;
; RV64-LABEL: vpscatter_baseidx_nxv8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v12
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64-NEXT: li a2, 4
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v16, v12, a2
; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds float, ptr %base, <vscale x 8 x i32> %idxs
@@ -1817,20 +1826,20 @@ define void @vpscatter_baseidx_sext_nxv6i8_nxv6f64(<vscale x 6 x double> %val, p
define void @vpscatter_baseidx_zext_nxv6i8_nxv6f64(<vscale x 6 x double> %val, ptr %base, <vscale x 6 x i8> %idxs, <vscale x 6 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv6i8_nxv6f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v18, v16
-; RV32-NEXT: vsll.vi v16, v18, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v18, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v18, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv6i8_nxv6f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v18, v16
-; RV64-NEXT: vsll.vi v16, v18, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v18, v16, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT: vsoxei16.v v8, (a0), v16, v0.t
+; RV64-NEXT: vsoxei16.v v8, (a0), v18, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 6 x i8> %idxs to <vscale x 6 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 6 x i64> %eidxs
@@ -1841,11 +1850,11 @@ define void @vpscatter_baseidx_zext_nxv6i8_nxv6f64(<vscale x 6 x double> %val, p
define void @vpscatter_baseidx_nxv6i16_nxv6f64(<vscale x 6 x double> %val, ptr %base, <vscale x 6 x i16> %idxs, <vscale x 6 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_nxv6i16_nxv6f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v20, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_nxv6i16_nxv6f64:
@@ -1863,11 +1872,11 @@ define void @vpscatter_baseidx_nxv6i16_nxv6f64(<vscale x 6 x double> %val, ptr %
define void @vpscatter_baseidx_sext_nxv6i16_nxv6f64(<vscale x 6 x double> %val, ptr %base, <vscale x 6 x i16> %idxs, <vscale x 6 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_sext_nxv6i16_nxv6f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v20, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_sext_nxv6i16_nxv6f64:
@@ -1886,20 +1895,20 @@ define void @vpscatter_baseidx_sext_nxv6i16_nxv6f64(<vscale x 6 x double> %val,
define void @vpscatter_baseidx_zext_nxv6i16_nxv6f64(<vscale x 6 x double> %val, ptr %base, <vscale x 6 x i16> %idxs, <vscale x 6 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv6i16_nxv6f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v20, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv6i16_nxv6f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v20, v16
-; RV64-NEXT: vsll.vi v16, v20, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v20, v16, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV64-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 6 x i16> %idxs to <vscale x 6 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 6 x i64> %eidxs
@@ -1918,10 +1927,11 @@ define void @vpscatter_baseidx_nxv6i32_nxv6f64(<vscale x 6 x double> %val, ptr %
;
; RV64-LABEL: vpscatter_baseidx_nxv6i32_nxv6f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v16, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 6 x i32> %idxs
call void @llvm.vp.scatter.nxv6f64.nxv6p0(<vscale x 6 x double> %val, <vscale x 6 x ptr> %ptrs, <vscale x 6 x i1> %m, i32 %evl)
@@ -1939,10 +1949,11 @@ define void @vpscatter_baseidx_sext_nxv6i32_nxv6f64(<vscale x 6 x double> %val,
;
; RV64-LABEL: vpscatter_baseidx_sext_nxv6i32_nxv6f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v16, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%eidxs = sext <vscale x 6 x i32> %idxs to <vscale x 6 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 6 x i64> %eidxs
@@ -1961,10 +1972,11 @@ define void @vpscatter_baseidx_zext_nxv6i32_nxv6f64(<vscale x 6 x double> %val,
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv6i32_nxv6f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v16, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 6 x i32> %idxs to <vscale x 6 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 6 x i64> %eidxs
@@ -2059,20 +2071,20 @@ define void @vpscatter_baseidx_sext_nxv8i8_nxv8f64(<vscale x 8 x double> %val, p
define void @vpscatter_baseidx_zext_nxv8i8_nxv8f64(<vscale x 8 x double> %val, ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv8i8_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV32-NEXT: vzext.vf2 v18, v16
-; RV32-NEXT: vsll.vi v16, v18, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vwmulu.vx v18, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei16.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei16.v v8, (a0), v18, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv8i8_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; RV64-NEXT: vzext.vf2 v18, v16
-; RV64-NEXT: vsll.vi v16, v18, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vwmulu.vx v18, v16, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT: vsoxei16.v v8, (a0), v16, v0.t
+; RV64-NEXT: vsoxei16.v v8, (a0), v18, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i64> %eidxs
@@ -2083,11 +2095,11 @@ define void @vpscatter_baseidx_zext_nxv8i8_nxv8f64(<vscale x 8 x double> %val, p
define void @vpscatter_baseidx_nxv8i16_nxv8f64(<vscale x 8 x double> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v20, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_nxv8i16_nxv8f64:
@@ -2105,11 +2117,11 @@ define void @vpscatter_baseidx_nxv8i16_nxv8f64(<vscale x 8 x double> %val, ptr %
define void @vpscatter_baseidx_sext_nxv8i16_nxv8f64(<vscale x 8 x double> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_sext_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v20, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_sext_nxv8i16_nxv8f64:
@@ -2128,20 +2140,20 @@ define void @vpscatter_baseidx_sext_nxv8i16_nxv8f64(<vscale x 8 x double> %val,
define void @vpscatter_baseidx_zext_nxv8i16_nxv8f64(<vscale x 8 x double> %val, ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV32-NEXT: vzext.vf2 v20, v16
-; RV32-NEXT: vsll.vi v16, v20, 3
+; RV32-NEXT: li a2, 8
+; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV32-NEXT: vwmulu.vx v20, v16, a2
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV32-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv8i16_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; RV64-NEXT: vzext.vf2 v20, v16
-; RV64-NEXT: vsll.vi v16, v20, 3
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; RV64-NEXT: vwmulu.vx v20, v16, a2
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT: vsoxei32.v v8, (a0), v16, v0.t
+; RV64-NEXT: vsoxei32.v v8, (a0), v20, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i64> %eidxs
@@ -2160,10 +2172,11 @@ define void @vpscatter_baseidx_nxv8i32_nxv8f64(<vscale x 8 x double> %val, ptr %
;
; RV64-LABEL: vpscatter_baseidx_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v16, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i32> %idxs
call void @llvm.vp.scatter.nxv8f64.nxv8p0(<vscale x 8 x double> %val, <vscale x 8 x ptr> %ptrs, <vscale x 8 x i1> %m, i32 %evl)
@@ -2181,10 +2194,11 @@ define void @vpscatter_baseidx_sext_nxv8i32_nxv8f64(<vscale x 8 x double> %val,
;
; RV64-LABEL: vpscatter_baseidx_sext_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v16, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i64> %eidxs
@@ -2203,10 +2217,11 @@ define void @vpscatter_baseidx_zext_nxv8i32_nxv8f64(<vscale x 8 x double> %val,
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v24, v16
-; RV64-NEXT: vsll.vi v16, v24, 3
-; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT: li a2, 8
+; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v16, a2
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 8 x i64> %eidxs
@@ -2307,25 +2322,16 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: sub sp, sp, a3
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vl4re16.v v24, (a1)
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vsext.vf2 v0, v24
+; RV32-NEXT: vl4re16.v v4, (a1)
+; RV32-NEXT: li a3, 8
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: vsll.vi v24, v0, 3
+; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; RV32-NEXT: vwmulsu.vx v24, v4, a3
; RV32-NEXT: mv a3, a2
; RV32-NEXT: bltu a2, a1, .LBB109_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: mv a3, a1
; RV32-NEXT: .LBB109_2:
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a2, a1
@@ -2337,11 +2343,6 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64:
@@ -2404,25 +2405,16 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %val, ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_sext_nxv16i16_nxv16f64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: sub sp, sp, a3
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vl4re16.v v24, (a1)
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vsext.vf2 v0, v24
+; RV32-NEXT: vl4re16.v v4, (a1)
+; RV32-NEXT: li a3, 8
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: vsll.vi v24, v0, 3
+; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; RV32-NEXT: vwmulsu.vx v24, v4, a3
; RV32-NEXT: mv a3, a2
; RV32-NEXT: bltu a2, a1, .LBB110_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: mv a3, a1
; RV32-NEXT: .LBB110_2:
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a2, a1
@@ -2434,11 +2426,6 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_sext_nxv16i16_nxv16f64:
@@ -2502,25 +2489,16 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %val, ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: sub sp, sp, a3
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
-; RV32-NEXT: addi a3, sp, 16
-; RV32-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT: vl4re16.v v24, (a1)
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vzext.vf2 v0, v24
+; RV32-NEXT: vl4re16.v v4, (a1)
+; RV32-NEXT: li a3, 8
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: vsll.vi v24, v0, 3
+; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; RV32-NEXT: vwmulu.vx v24, v4, a3
; RV32-NEXT: mv a3, a2
; RV32-NEXT: bltu a2, a1, .LBB111_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: mv a3, a1
; RV32-NEXT: .LBB111_2:
-; RV32-NEXT: addi a4, sp, 16
-; RV32-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload
; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV32-NEXT: sub a3, a2, a1
@@ -2532,34 +2510,20 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 16
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64:
; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: sub sp, sp, a3
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
-; RV64-NEXT: addi a3, sp, 16
-; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill
-; RV64-NEXT: vl4re16.v v24, (a1)
-; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT: vzext.vf2 v0, v24
+; RV64-NEXT: vl4re16.v v4, (a1)
+; RV64-NEXT: li a3, 8
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: vsll.vi v24, v0, 3
+; RV64-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v4, a3
; RV64-NEXT: mv a3, a2
; RV64-NEXT: bltu a2, a1, .LBB111_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: mv a3, a1
; RV64-NEXT: .LBB111_2:
-; RV64-NEXT: addi a4, sp, 16
-; RV64-NEXT: vl1r.v v0, (a4) # Unknown-size Folded Reload
; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; RV64-NEXT: vsoxei32.v v8, (a0), v24, v0.t
; RV64-NEXT: sub a3, a2, a1
@@ -2571,11 +2535,6 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
; RV64-NEXT: and a2, a2, a3
; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; RV64-NEXT: vsoxei32.v v16, (a0), v28, v0.t
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 16
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
%eidxs = zext <vscale x 16 x i16> %idxs to <vscale x 16 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <vscale x 16 x i64> %eidxs
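; Illustrative sketch (not part of the test file above; function name is hypothetical):
; the scatter updates mirror the gather side. For sign-extended indices the scale by 8
; becomes vwmulsu.vx (signed index vector multiplied by the unsigned scalar 8) at the
; narrow SEW, replacing the old vsext.vf2 + vsll.vi 3 sequence at the wide SEW.
declare void @llvm.vp.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64>, <vscale x 4 x ptr>, <vscale x 4 x i1>, i32)

define void @example_vpscatter_sext_i32_i64(<vscale x 4 x i64> %val, ptr %base, <vscale x 4 x i32> %idxs, <vscale x 4 x i1> %m, i32 zeroext %evl) {
  ; Widen the i32 indices with sign extension and scale by sizeof(i64) = 8.
  %eidxs = sext <vscale x 4 x i32> %idxs to <vscale x 4 x i64>
  %ptrs = getelementptr inbounds i64, ptr %base, <vscale x 4 x i64> %eidxs
  call void @llvm.vp.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> %val, <vscale x 4 x ptr> %ptrs, <vscale x 4 x i1> %m, i32 %evl)
  ret void
}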
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index a20f88c..b47edf9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -465,3 +465,131 @@ define float @vreduce_fmaximum_v4f32(float %start, <4 x float> %val, <4 x i1> %m
%s = call float @llvm.vp.reduce.fmaximum.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 %evl)
ret float %s
}
+
+define float @vpreduce_fadd_fpext_vp_nxv1f16_nxv1f32(float %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_fpext_vp_nxv1f16_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+ %w = call <vscale x 1 x float> @llvm.vp.fpext(<vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 %evl)
+ %r = call reassoc float @llvm.vp.reduce.fadd(float %s, <vscale x 1 x float> %w, <vscale x 1 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+define float @vpreduce_ord_fadd_fpext_vp_fpext_nxv1f16_nxv1f32(float %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_fpext_vp_fpext_nxv1f16_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+ %w = call <vscale x 1 x float> @llvm.vp.fpext(<vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 %evl)
+ %r = call float @llvm.vp.reduce.fadd(float %s, <vscale x 1 x float> %w, <vscale x 1 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+define double @vpreduce_fadd_fpext_vp_nxv1f32_nxv1f64(double %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_fpext_vp_nxv1f32_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+ %w = call <vscale x 1 x double> @llvm.vp.fpext(<vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 %evl)
+ %r = call reassoc double @llvm.vp.reduce.fadd(double %s, <vscale x 1 x double> %w, <vscale x 1 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+define double @vpreduce_ord_fadd_fpext_vp_fpext_nxv1f32_nxv1f64(double %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_fpext_vp_fpext_nxv1f32_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfwcvt.f.f.v v9, v8, v0.t
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+ %w = call <vscale x 1 x double> @llvm.vp.fpext(<vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 %evl)
+ %r = call double @llvm.vp.reduce.fadd(double %s, <vscale x 1 x double> %w, <vscale x 1 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+define float @vpreduce_fadd_fpext_nxv1f16_nxv1f32(float %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_fpext_nxv1f16_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvt.f.f.v v9, v8
+; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+ %w = fpext <vscale x 1 x half> %v to <vscale x 1 x float>
+ %r = call reassoc float @llvm.vp.reduce.fadd(float %s, <vscale x 1 x float> %w, <vscale x 1 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+define float @vpreduce_ord_fadd_fpext_nxv1f16_nxv1f32(float %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_fpext_nxv1f16_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vfwcvt.f.f.v v9, v8
+; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+ %w = fpext <vscale x 1 x half> %v to <vscale x 1 x float>
+ %r = call float @llvm.vp.reduce.fadd(float %s, <vscale x 1 x float> %w, <vscale x 1 x i1> %m, i32 %evl)
+ ret float %r
+}
+
+define double @vpreduce_fadd_fpext_nxv1f32_nxv1f64(double %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_fadd_fpext_nxv1f32_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfwcvt.f.f.v v9, v8
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vfredusum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+ %w = fpext <vscale x 1 x float> %v to <vscale x 1 x double>
+ %r = call reassoc double @llvm.vp.reduce.fadd(double %s, <vscale x 1 x double> %w, <vscale x 1 x i1> %m, i32 %evl)
+ ret double %r
+}
+
+define double @vpreduce_ord_fadd_fpext_nxv1f32_nxv1f64(double %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpreduce_ord_fadd_fpext_nxv1f32_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vfwcvt.f.f.v v9, v8
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: vfmv.s.f v8, fa0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vfredosum.vs v8, v9, v8, v0.t
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+ %w = fpext <vscale x 1 x float> %v to <vscale x 1 x double>
+ %r = call double @llvm.vp.reduce.fadd(double %s, <vscale x 1 x double> %w, <vscale x 1 x i1> %m, i32 %evl)
+ ret double %r
+}
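; Note on the tests added above: the reassoc calls to @llvm.vp.reduce.fadd may use the
; unordered reduction vfredusum.vs, while the plain (ordered) calls use vfredosum.vs.
; In both cases the half/float source is first widened with vfwcvt.f.f.v, which is
; masked by v0.t only in the @llvm.vp.fpext variants.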
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
index ff807ad..fd09fe7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
@@ -224,9 +224,10 @@ define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_zext(<vscale x 2 x i32> %a, i8 %b
define <vscale x 2 x i64> @vwsll_vi_nxv2i64(<vscale x 2 x i32> %a) {
; CHECK-LABEL: vwsll_vi_nxv2i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a0
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vwsll_vi_nxv2i64:
@@ -432,9 +433,10 @@ define <vscale x 4 x i32> @vwsll_vx_i8_nxv4i32_zext(<vscale x 4 x i16> %a, i8 %b
define <vscale x 4 x i32> @vwsll_vi_nxv4i32(<vscale x 4 x i16> %a) {
; CHECK-LABEL: vwsll_vi_nxv4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a0
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vwsll_vi_nxv4i32:
@@ -612,9 +614,10 @@ define <vscale x 8 x i16> @vwsll_vx_i8_nxv8i16_zext(<vscale x 8 x i8> %a, i8 %b)
define <vscale x 8 x i16> @vwsll_vi_nxv8i16(<vscale x 8 x i8> %a) {
; CHECK-LABEL: vwsll_vi_nxv8i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a0, 4
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a0
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vwsll_vi_nxv8i16:
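; Illustrative sketch (hypothetical function, not part of the test file above): for a
; zero-extended value, a left shift by an immediate c equals a widening multiply by
; (1 << c). That identity is why, without Zvbb's vwsll, a pattern like the one below
; can be selected as vwmulu.vx with the scalar constant 4 (1 << 2) at the narrow SEW
; instead of vzext.vf2 followed by vsll.vi 2 at the wide SEW.
define <vscale x 4 x i32> @example_widening_shift_by_2(<vscale x 4 x i16> %a) {
  %ext = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
  %shl = shl <vscale x 4 x i32> %ext, splat (i32 2)
  ret <vscale x 4 x i32> %shl
}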
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll
index c30c476..1358a7c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll
@@ -240,9 +240,10 @@ define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_zext(<vscale x 2 x i32> %a, i8 %b
define <vscale x 2 x i64> @vwsll_vi_nxv2i64(<vscale x 2 x i32> %a, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vwsll_vi_nxv2i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vwsll_vi_nxv2i64:
@@ -464,9 +465,10 @@ define <vscale x 4 x i32> @vwsll_vx_i8_nxv4i32_zext(<vscale x 4 x i16> %a, i8 %b
define <vscale x 4 x i32> @vwsll_vi_nxv4i32(<vscale x 4 x i16> %a, <vscale x 4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vwsll_vi_nxv4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vwsll_vi_nxv4i32:
@@ -661,9 +663,10 @@ define <vscale x 8 x i16> @vwsll_vx_i8_nxv8i16_zext(<vscale x 8 x i8> %a, i8 %b,
define <vscale x 8 x i16> @vwsll_vi_nxv8i16(<vscale x 8 x i8> %a, <vscale x 8 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vwsll_vi_nxv8i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1, v0.t
+; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vwsll_vi_nxv8i16:
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
index 7e48b37..5d7c337c 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
@@ -18,18 +18,18 @@ define <64 x i8> @add_v64i8_broadcasts(<64 x i8> %a0, i64 %a1, i8 %a2) {
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
; AVX512F-NEXT: kmovw %edi, %k4
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k4} {z} = -1
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1
; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
@@ -37,7 +37,7 @@ define <64 x i8> @add_v64i8_broadcasts(<64 x i8> %a0, i64 %a1, i8 %a2) {
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm2, %zmm0
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm2 & (zmm0 ^ zmm4))
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: add_v64i8_broadcasts:
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index c27cced..23b46ee 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -166,7 +166,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL-NEXT: pushq %rax
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: callq _func16xi1
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -194,7 +194,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL_X32-NEXT: subl $12, %esp
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
-; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
; KNL_X32-NEXT: calll _func16xi1
; KNL_X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll
index e463788..56d6d13 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp.ll
@@ -190,7 +190,7 @@ define <8 x i32> @legalize_loop(<8 x double> %arg) {
; KNL: ## %bb.0:
; KNL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; KNL-NEXT: vcmpnltpd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; KNL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[3,2,1,0,7,6,5,4]
; KNL-NEXT: vpsrld $31, %ymm1, %ymm1
; KNL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 79e59fd..d19eaf4 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -1603,7 +1603,7 @@ define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
; KNL-LABEL: zext_16i1_to_16xi32:
; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; KNL-NEXT: vpsrld $31, %zmm0, %zmm0
; KNL-NEXT: retq
;
@@ -1629,7 +1629,7 @@ define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
; KNL-LABEL: zext_8i1_to_8xi64:
; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; KNL-NEXT: vpsrlq $63, %zmm0, %zmm0
; KNL-NEXT: retq
;
@@ -1747,14 +1747,14 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; KNL-LABEL: sext_8i1_8i32:
; KNL: # %bb.0:
; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL-NEXT: retq
;
; AVX512DQ-LABEL: sext_8i1_8i32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpternlogq $15, %ymm0, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0
; AVX512DQ-NEXT: retq
%x = icmp slt <8 x i32> %a1, %a2
%x1 = xor <8 x i1>%x, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
@@ -1840,7 +1840,7 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
; KNL-LABEL: sext_16i1_16i32:
; KNL: # %bb.0:
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; KNL-NEXT: retq
;
; AVX512DQ-LABEL: sext_16i1_16i32:
@@ -2313,12 +2313,12 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kshiftlw $15, %k2, %k2
; KNL-NEXT: korw %k2, %k0, %k2
-; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; KNL-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; KNL-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; KNL-NEXT: vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1
; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; KNL-NEXT: vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1
; KNL-NEXT: vpmovdw %zmm2, %ymm2
; KNL-NEXT: vpmovdw %zmm3, %ymm3
; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
index 6c661eb..5115c3c 100644
--- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -485,7 +485,7 @@ define void @load_v64i1_broadcast_32_v16i1(ptr %a0,<16 x float> %a1,<16 x float>
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
-; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
@@ -642,7 +642,7 @@ define void @load_v64i1_broadcast_63_v16i1(ptr %a0,<16 x float> %a1,<16 x float>
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512NOTDQ-FAST: # %bb.0:
; AVX512NOTDQ-FAST-NEXT: kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-FAST-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-FAST-NEXT: vpermd %zmm2, %zmm3, %zmm2
; AVX512NOTDQ-FAST-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -654,7 +654,7 @@ define void @load_v64i1_broadcast_63_v16i1(ptr %a0,<16 x float> %a1,<16 x float>
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-PERLANE-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-FAST-PERLANE-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512NOTDQ-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -1426,7 +1426,7 @@ define void @load_v64i1_broadcast_32_v16i1_store(ptr %a0,ptr %a1) {
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
-; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
@@ -1596,7 +1596,7 @@ define void @load_v64i1_broadcast_63_v16i1_store(ptr %a0,ptr %a1) {
; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ-FAST: # %bb.0:
; AVX512NOTDQ-FAST-NEXT: kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-FAST-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512NOTDQ-FAST-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1607,7 +1607,7 @@ define void @load_v64i1_broadcast_63_v16i1_store(ptr %a0,ptr %a1) {
; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k1
-; AVX512NOTDQ-FAST-PERLANE-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-FAST-PERLANE-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512NOTDQ-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %zmm0, %zmm0, %k0
diff --git a/llvm/test/CodeGen/X86/avx512-load-store.ll b/llvm/test/CodeGen/X86/avx512-load-store.ll
index c32c3d9..ce6bfa90 100644
--- a/llvm/test/CodeGen/X86/avx512-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx512-load-store.ll
@@ -143,7 +143,7 @@ define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, ptr
; CHECK64-LABEL: test_mm_mask_load_ss:
; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
-; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT: vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_mask_load_ss:
@@ -151,7 +151,7 @@ define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, ptr
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
-; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1}
+; CHECK32-NEXT: vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
; CHECK32-NEXT: retl
entry:
%shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
@@ -168,7 +168,7 @@ define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, pt
; CHECK64-LABEL: test_mm_mask_load_sd:
; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
-; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT: vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_mask_load_sd:
@@ -176,7 +176,7 @@ define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, pt
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
-; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1}
+; CHECK32-NEXT: vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
; CHECK32-NEXT: retl
entry:
%shuffle5.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
@@ -192,7 +192,7 @@ define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, ptr %__W) local_unnam
; CHECK64-LABEL: test_mm_maskz_load_ss:
; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
-; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT: vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_maskz_load_ss:
@@ -200,7 +200,7 @@ define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, ptr %__W) local_unnam
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
-; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT: vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
; CHECK32-NEXT: retl
entry:
%0 = and i8 %__U, 1
@@ -215,7 +215,7 @@ define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, ptr %__W) local_unna
; CHECK64-LABEL: test_mm_maskz_load_sd:
; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
-; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT: vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_maskz_load_sd:
@@ -223,7 +223,7 @@ define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, ptr %__W) local_unna
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
-; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT: vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
; CHECK32-NEXT: retl
entry:
%0 = and i8 %__U, 1
@@ -283,7 +283,7 @@ define <4 x float> @test_mm_mask_load_ss_2(<4 x float> %__A, i8 zeroext %__U, pt
; CHECK64-LABEL: test_mm_mask_load_ss_2:
; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
-; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT: vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_mask_load_ss_2:
@@ -291,7 +291,7 @@ define <4 x float> @test_mm_mask_load_ss_2(<4 x float> %__A, i8 zeroext %__U, pt
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
-; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1}
+; CHECK32-NEXT: vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
; CHECK32-NEXT: retl
entry:
%shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
@@ -306,7 +306,7 @@ define <4 x float> @test_mm_maskz_load_ss_2(i8 zeroext %__U, ptr readonly %__W)
; CHECK64-LABEL: test_mm_maskz_load_ss_2:
; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
-; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT: vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_maskz_load_ss_2:
@@ -314,7 +314,7 @@ define <4 x float> @test_mm_maskz_load_ss_2(i8 zeroext %__U, ptr readonly %__W)
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
-; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT: vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
; CHECK32-NEXT: retl
entry:
%0 = and i8 %__U, 1
@@ -328,7 +328,7 @@ define <2 x double> @test_mm_mask_load_sd_2(<2 x double> %__A, i8 zeroext %__U,
; CHECK64-LABEL: test_mm_mask_load_sd_2:
; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
-; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT: vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_mask_load_sd_2:
@@ -336,7 +336,7 @@ define <2 x double> @test_mm_mask_load_sd_2(<2 x double> %__A, i8 zeroext %__U,
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
-; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1}
+; CHECK32-NEXT: vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
; CHECK32-NEXT: retl
entry:
%shuffle3.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
@@ -351,7 +351,7 @@ define <2 x double> @test_mm_maskz_load_sd_2(i8 zeroext %__U, ptr readonly %__W)
; CHECK64-LABEL: test_mm_maskz_load_sd_2:
; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
-; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT: vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_maskz_load_sd_2:
@@ -359,7 +359,7 @@ define <2 x double> @test_mm_maskz_load_sd_2(i8 zeroext %__U, ptr readonly %__W)
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
-; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT: vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
; CHECK32-NEXT: retl
entry:
%0 = and i8 %__U, 1
diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll
index e53e194..23f4fcb 100644
--- a/llvm/test/CodeGen/X86/avx512-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512-logic.ll
@@ -856,7 +856,7 @@ entry:
define <16 x i32> @ternlog_and_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
; ALL-LABEL: ternlog_and_andn:
; ALL: ## %bb.0:
-; ALL-NEXT: vpternlogd $8, %zmm1, %zmm2, %zmm0
+; ALL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm2 & zmm1 & ~zmm0
; ALL-NEXT: retq
%a = xor <16 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%b = and <16 x i32> %y, %a
@@ -867,7 +867,7 @@ define <16 x i32> @ternlog_and_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
define <16 x i32> @ternlog_or_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
; ALL-LABEL: ternlog_or_andn:
; ALL: ## %bb.0:
-; ALL-NEXT: vpternlogd $206, %zmm1, %zmm2, %zmm0
+; ALL-NEXT: vpternlogd {{.*#+}} zmm0 = (zmm1 & ~zmm0) | zmm2
; ALL-NEXT: retq
%a = xor <16 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%b = and <16 x i32> %y, %a
@@ -878,7 +878,7 @@ define <16 x i32> @ternlog_or_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
define <16 x i32> @ternlog_xor_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
; ALL-LABEL: ternlog_xor_andn:
; ALL: ## %bb.0:
-; ALL-NEXT: vpternlogd $198, %zmm1, %zmm2, %zmm0
+; ALL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm2 ^ (zmm1 & ~zmm0)
; ALL-NEXT: retq
%a = xor <16 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%b = and <16 x i32> %y, %a
@@ -889,7 +889,7 @@ define <16 x i32> @ternlog_xor_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) {
; ALL-LABEL: ternlog_or_and_mask:
; ALL: ## %bb.0:
-; ALL-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; ALL-NEXT: vpternlogd {{.*#+}} zmm0 = (zmm0 & mem) | zmm1
; ALL-NEXT: retq
%a = and <16 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%b = or <16 x i32> %a, %y
@@ -899,7 +899,7 @@ define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) {
define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) {
; ALL-LABEL: ternlog_xor_and_mask:
; ALL: ## %bb.0:
-; ALL-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
+; ALL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
; ALL-NEXT: retq
%a = and <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%b = xor <8 x i64> %a, %y
@@ -911,7 +911,7 @@ define <16 x i32> @ternlog_maskz_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x
; ALL: ## %bb.0:
; ALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
; ALL-NEXT: vpsrad $31, %zmm2, %zmm0
-; ALL-NEXT: vpternlogd $224, %zmm1, %zmm3, %zmm0
+; ALL-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 & (zmm3 | zmm1)
; ALL-NEXT: retq
%m = icmp slt <16 x i32> %mask, zeroinitializer
%a = and <16 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -925,7 +925,7 @@ define <8 x i64> @ternlog_maskz_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i6
; ALL: ## %bb.0:
; ALL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm3
; ALL-NEXT: vpsraq $63, %zmm2, %zmm0
-; ALL-NEXT: vpternlogq $96, %zmm1, %zmm3, %zmm0
+; ALL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 & (zmm3 ^ zmm1)
; ALL-NEXT: retq
%m = icmp slt <8 x i64> %mask, zeroinitializer
%a = and <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index 536c667..721ffbe 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -502,7 +502,7 @@ define <16 x i64> @narrowExtractedVectorSelect_crash(<16 x i64> %arg, <16 x i16>
; X86-AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; X86-AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; X86-AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; X86-AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; X86-AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; X86-AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm1
; X86-AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
@@ -515,7 +515,7 @@ define <16 x i64> @narrowExtractedVectorSelect_crash(<16 x i64> %arg, <16 x i16>
; X64-AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; X64-AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; X64-AVX512F-NEXT: kunpckbw %k0, %k1, %k1
-; X64-AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; X64-AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; X64-AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm1
; X64-AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/cmp-xor.ll b/llvm/test/CodeGen/X86/cmp-xor.ll
index 6be5cea..a8a7da4 100644
--- a/llvm/test/CodeGen/X86/cmp-xor.ll
+++ b/llvm/test/CodeGen/X86/cmp-xor.ll
@@ -9,22 +9,18 @@
define i32 @cmp_xor_i32(i32 %a, i32 %b, i32 %c)
; X86-LABEL: cmp_xor_i32:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl %ecx, %eax
-; X86-NEXT: je .LBB0_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB0_1:
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: jne .LBB0_2
+; X86-NEXT: # %bb.1:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: .LBB0_2:
; X86-NEXT: retl
;
; X64-LABEL: cmp_xor_i32:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %esi, %eax
-; X64-NEXT: cmpl %esi, %edi
; X64-NEXT: cmovel %edx, %eax
; X64-NEXT: retq
{
@@ -37,22 +33,18 @@ define i32 @cmp_xor_i32(i32 %a, i32 %b, i32 %c)
define i32 @cmp_xor_i32_commute(i32 %a, i32 %b, i32 %c)
; X86-LABEL: cmp_xor_i32_commute:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpl %eax, %ecx
-; X86-NEXT: je .LBB1_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: retl
-; X86-NEXT: .LBB1_1:
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: jne .LBB1_2
+; X86-NEXT: # %bb.1:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: .LBB1_2:
; X86-NEXT: retl
;
; X64-LABEL: cmp_xor_i32_commute:
; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: xorl %edi, %eax
-; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl %esi, %eax
; X64-NEXT: cmovel %edx, %eax
; X64-NEXT: retq
{
diff --git a/llvm/test/CodeGen/X86/merge-huge-sp-updates.ll b/llvm/test/CodeGen/X86/merge-huge-sp-updates.ll
new file mode 100644
index 0000000..b26345e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/merge-huge-sp-updates.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -mtriple=x86_64-linux-unknown -verify-machineinstrs -o %t.s
+; RUN: FileCheck --input-file=%t.s %s
+
+; Double-check that we are able to assemble the generated '.s'. A symptom of the
+; problem that led to this test is an assembler failure when using
+; '-save-temps'. For example:
+;
+; > ...s:683:7: error: invalid operand for instruction
+; > addq $2147483679, %rsp # imm = 0x8000001F
+;
+; RUN: llvm-mc -triple x86_64-unknown-unknown %t.s
+
+; Check that the stack update after calling bar gets merged into the second add
+; and not the first, which is already at the chunk size limit (0x7FFFFFFF).
+
+define void @foo(ptr %rhs) {
+; CHECK-LABEL: foo
+entry:
+ %lhs = alloca [5 x [5 x [3 x [162 x [161 x [161 x double]]]]]], align 16
+ store ptr %lhs, ptr %rhs, align 8
+ %0 = call i32 @baz()
+ call void @bar(i64 0, i64 0, i64 0, i64 0, i64 0, ptr null, ptr %rhs, ptr null, ptr %rhs)
+; CHECK: call{{.*}}bar
+; CHECK: addq{{.*}}$2147483647, %rsp
+; CHECK: addq{{.*}}$372037585, %rsp
+; CHECK: .cfi_adjust_cfa_offset -2519521232
+ ret void
+}
+
+declare void @bar(i64, i64, i64, i64, i64, ptr, ptr, ptr, ptr)
+
+declare i32 @baz()
diff --git a/llvm/test/CodeGen/X86/pr32284.ll b/llvm/test/CodeGen/X86/pr32284.ll
index 90fb76a..8a726a4 100644
--- a/llvm/test/CodeGen/X86/pr32284.ll
+++ b/llvm/test/CodeGen/X86/pr32284.ll
@@ -321,11 +321,9 @@ define void @f2() {
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: testl %eax, %eax
; X64-NEXT: sete %cl
-; X64-NEXT: movl %eax, %edx
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpl %eax, %ecx
+; X64-NEXT: xorl %eax, %ecx
+; X64-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
; X64-NEXT: sete %dl
; X64-NEXT: movw %dx, (%rax)
; X64-NEXT: retq
@@ -366,17 +364,15 @@ define void @f2() {
; X86: # %bb.0: # %entry
; X86-NEXT: subl $2, %esp
; X86-NEXT: .cfi_def_cfa_offset 6
-; X86-NEXT: movzbl var_7, %ecx
+; X86-NEXT: movzbl var_7, %edx
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: testl %edx, %edx
; X86-NEXT: sete %al
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: movw %dx, (%esp)
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: cmpl %ecx, %eax
-; X86-NEXT: sete %dl
-; X86-NEXT: movw %dx, (%eax)
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: movw %ax, (%esp)
+; X86-NEXT: sete %cl
+; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: addl $2, %esp
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
diff --git a/llvm/test/DebugInfo/NVPTX/debug-addr-space.ll b/llvm/test/DebugInfo/NVPTX/debug-addr-space.ll
new file mode 100644
index 0000000..a96b519
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/debug-addr-space.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+
+; Test that the translateToNVVMDWARFAddrSpace() function translates the NVVM IR address space
+; value `Shared` (3) to the corresponding DWARF DW_AT_address_class attribute for PTX.
+
+; CHECK: .section .debug_info
+; CHECK: .b8 103 // DW_AT_name
+; CHECK-NEXT: .b8 0
+; CHECK-NEXT: .b32 55 // DW_AT_type
+; CHECK-NEXT: .b8 1 // DW_AT_decl_file
+; CHECK-NEXT: .b8 1 // DW_AT_decl_line
+; CHECK-NEXT: .b8 8 // DW_AT_address_class
+
+@g = internal addrspace(3) global i32 0, align 4, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!6}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "g", linkageName: "g", scope: !2, file: !3, line: 1, type: !5, isLocal: true, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4)
+!3 = !DIFile(filename: "test.cu", directory: "test")
+!4 = !{!0}
+!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!6 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll b/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
new file mode 100644
index 0000000..1b8f451
--- /dev/null
+++ b/llvm/test/ExecutionEngine/Orc/minimal-throw-catch.ll
@@ -0,0 +1,52 @@
+; REQUIRES: x86_64-apple
+; RUN: lli -jit-kind=orc %s
+;
+; Basic correctness testing for eh-frame processing and registration.
+
+@_ZTIi = external constant ptr
+
+declare ptr @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(ptr, ptr, ptr)
+
+declare i32 @__gxx_personality_v0(...)
+declare i32 @llvm.eh.typeid.for(ptr)
+declare ptr @__cxa_begin_catch(ptr)
+declare void @__cxa_end_catch()
+
+define void @explode() {
+entry:
+ %exception = tail call ptr @__cxa_allocate_exception(i64 4)
+ store i32 42, ptr %exception, align 16
+ tail call void @__cxa_throw(ptr %exception, ptr @_ZTIi, ptr null)
+ unreachable
+}
+
+define i32 @main(i32 %argc, ptr %argv) personality ptr @__gxx_personality_v0 {
+entry:
+ invoke void @explode()
+ to label %return unwind label %lpad
+
+lpad:
+ %0 = landingpad { ptr, i32 }
+ catch ptr @_ZTIi
+ %1 = extractvalue { ptr, i32 } %0, 1
+ %2 = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+ %matches = icmp eq i32 %1, %2
+ br i1 %matches, label %catch, label %eh.resume
+
+catch:
+ %3 = extractvalue { ptr, i32 } %0, 0
+ %4 = tail call ptr @__cxa_begin_catch(ptr %3)
+ %5 = load i32, ptr %4, align 4
+ %cmp = icmp ne i32 %5, 42
+ %cond = zext i1 %cmp to i32
+ tail call void @__cxa_end_catch()
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %cond, %catch ], [ 2, %entry ]
+ ret i32 %retval.0
+
+eh.resume:
+ resume { ptr, i32 } %0
+}
diff --git a/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll b/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll
index 5bc5769..cd22ec6 100644
--- a/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll
+++ b/llvm/test/ExecutionEngine/OrcLazy/minimal-throw-catch.ll
@@ -3,10 +3,6 @@
;
; Basic correctness testing for eh-frame processing and registration.
-source_filename = "minimal-throw-catch.cpp"
-target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.14.0"
-
@_ZTIi = external constant ptr
declare ptr @__cxa_allocate_exception(i64)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll
index f0e607d..3a2ecfe 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll
@@ -2,11 +2,6 @@
; RUN: opt < %s -passes=msan -S | FileCheck %s
;
; Forked from llvm/test/CodeGen/AArch64/arm64-vaddv.ll
-;
-; Incorrectly handled by handleUnknownInstruction:
-; - llvm.aarch64.neon.faddv
-; - llvm.aarch64.neon.saddv
-; - llvm.aarch64.neon.uaddv
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android9001"
@@ -17,16 +12,12 @@ define signext i8 @test_vaddv_s8(<8 x i8> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i8
-; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i8 [[TMP4]]
;
entry:
@@ -42,16 +33,12 @@ define <8 x i8> @test_vaddv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i8
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> [[TMP1]], i8 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> [[TMP1]], i8 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[A1]], i8 [[TMP6]], i32 3
; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i8> [[TMP7]]
@@ -69,16 +56,12 @@ define signext i16 @test_vaddv_s16(<4 x i16> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i16
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i16
-; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i16 [[TMP4]]
;
entry:
@@ -94,16 +77,12 @@ define <4 x i16> @test_vaddv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i16
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> [[TMP1]], i16 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i16> [[A1]], i16 [[TMP6]], i32 3
; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i16> [[TMP7]]
@@ -121,15 +100,9 @@ define i32 @test_vaddv_s32(<2 x i32> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[VADDV_I]]
;
; 2 x i32 is not supported by the ISA, thus, this is a special case
@@ -145,15 +118,9 @@ define <2 x i32> @test_vaddv_s32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]])
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[A2]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP1]], i32 0, i32 1
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[A1]], i32 [[TMP5]], i32 1
; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i32> [[TMP6]]
@@ -170,15 +137,9 @@ define i64 @test_vaddv_s64(<2 x i64> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[VADDV_I]]
;
entry:
@@ -193,15 +154,9 @@ define <2 x i64> @test_vaddv_s64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]])
; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[A2]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 1
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i64 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[A1]], i64 [[TMP5]], i64 1
; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i64> [[TMP6]]
@@ -218,16 +173,12 @@ define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i8
-; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i8 [[TMP4]]
;
entry:
@@ -243,16 +194,12 @@ define <8 x i8> @test_vaddv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i8
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> [[TMP1]], i8 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> [[TMP1]], i8 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[A1]], i8 [[TMP6]], i32 3
; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i8> [[TMP7]]
@@ -270,17 +217,14 @@ define i32 @test_vaddv_u8_masked(<8 x i8> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A1]])
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 0
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[VADDV_I]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = or i32 0, [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP2]], 511
+; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[VADDV_I]], 511
; CHECK-NEXT: store i32 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[TMP7]]
@@ -297,16 +241,12 @@ define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i16
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i16
-; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i16 [[TMP4]]
;
entry:
@@ -322,16 +262,12 @@ define <4 x i16> @test_vaddv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i16
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> [[TMP1]], i16 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i16> [[A1]], i16 [[TMP6]], i32 3
; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i16> [[TMP7]]
@@ -349,17 +285,14 @@ define i32 @test_vaddv_u16_masked(<4 x i16> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A1]])
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 0
; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[VADDV_I]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = or i32 0, [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP2]], 3276799
+; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[VADDV_I]], 3276799
; CHECK-NEXT: store i32 [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[TMP7]]
@@ -376,15 +309,9 @@ define i32 @test_vaddv_u32(<2 x i32> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[VADDV_I]]
;
; 2 x i32 is not supported by the ISA, thus, this is a special case
@@ -400,15 +327,9 @@ define <2 x i32> @test_vaddv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]])
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[A2]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP1]], i32 0, i32 1
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[A1]], i32 [[TMP5]], i32 1
; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i32> [[TMP6]]
@@ -425,15 +346,9 @@ define float @test_vaddv_f32(<2 x float> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[VADDV_I]]
;
entry:
@@ -447,15 +362,9 @@ define float @test_vaddv_v4f32(<4 x float> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[VADDV_I]]
;
entry:
@@ -469,15 +378,9 @@ define double @test_vaddv_f64(<2 x double> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[VADDV_I]]
;
entry:
@@ -491,15 +394,9 @@ define i64 @test_vaddv_u64(<2 x i64> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[VADDV_I]]
;
entry:
@@ -514,15 +411,9 @@ define <2 x i64> @test_vaddv_u64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]])
; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A2]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 1
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i64 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[A1]], i64 [[TMP5]], i64 1
; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x i64> [[TMP6]]
@@ -540,15 +431,9 @@ define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1, <1 x i64> %param1) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A1]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <1 x i64> [[TMP2]], i64 0, i32 0
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP3]], i32 0
; CHECK-NEXT: [[VEC:%.*]] = insertelement <1 x i64> [[PARAM1]], i64 [[VADDV_I]], i32 0
; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <1 x i64> [[VEC]]
@@ -565,16 +450,12 @@ define signext i8 @test_vaddvq_s8(<16 x i8> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i8
-; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i8 [[TMP4]]
;
entry:
@@ -590,16 +471,12 @@ define <16 x i8> @test_vaddvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i8
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> [[TMP1]], i8 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[A1]], i8 [[TMP6]], i32 3
; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i8> [[TMP7]]
@@ -617,16 +494,12 @@ define signext i16 @test_vaddvq_s16(<8 x i16> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i16
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i16
-; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i16 [[TMP4]]
;
entry:
@@ -642,16 +515,12 @@ define <8 x i16> @test_vaddvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) #
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i16
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> [[TMP1]], i16 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i16> [[A1]], i16 [[TMP6]], i32 3
; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i16> [[TMP7]]
@@ -669,15 +538,9 @@ define i32 @test_vaddvq_s32(<4 x i32> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[VADDV_I]]
;
entry:
@@ -692,15 +555,9 @@ define <4 x i32> @test_vaddvq_s32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) #
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A2]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[A1]], i32 [[TMP5]], i32 3
; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP6]]
@@ -717,16 +574,12 @@ define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i8
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i8
-; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i8 [[TMP4]]
;
entry:
@@ -742,16 +595,12 @@ define <16 x i8> @test_vaddvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) #0
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i8
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> [[TMP1]], i8 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[A1]], i8 [[TMP6]], i32 3
; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i8> [[TMP7]]
@@ -769,16 +618,12 @@ define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A1]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc i32 [[TMP2]] to i16
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[VADDV_I]] to i16
-; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i16 [[TMP4]]
;
entry:
@@ -794,16 +639,12 @@ define <8 x i16> @test_vaddvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) #
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A2]])
+; CHECK-NEXT: [[_MSPROP1:%.*]] = trunc i32 [[TMP3]] to i16
; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> [[TMP1]], i16 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[_MSPROP1]], i32 3
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i16> [[A1]], i16 [[TMP6]], i32 3
; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i16> [[TMP7]]
@@ -821,15 +662,9 @@ define i32 @test_vaddvq_u32(<4 x i32> %a1) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]]
-; CHECK: 2:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[VADDV_I]]
;
entry:
@@ -844,15 +679,9 @@ define <4 x i32> @test_vaddvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) #
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A2]])
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i32 3
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[A1]], i32 [[TMP5]], i32 3
; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x i32> [[TMP6]]
@@ -896,6 +725,3 @@ declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
attributes #0 = { sanitize_memory }
-;.
-; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
-;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fadd.ll b/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fadd.ll
index 306a262..5da4c73 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fadd.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fadd.ll
@@ -15,17 +15,10 @@ define float @test_v2f32(float %a0, <2 x float> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fadd.v2f32(float [[A0]], <2 x float> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP6]]
;
%1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
@@ -38,17 +31,10 @@ define float @test_v4f32(float %a0, <4 x float> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[A0]], <4 x float> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP6]]
;
%1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
@@ -61,17 +47,10 @@ define float @test_v8f32(float %a0, <8 x float> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[A0]], <8 x float> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP6]]
;
%1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
@@ -84,17 +63,10 @@ define float @test_v16f32(float %a0, <16 x float> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fadd.v16f32(float [[A0]], <16 x float> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP6]]
;
%1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
@@ -107,15 +79,10 @@ define float @test_v2f32_zero(<2 x float> %a0) #0 {
; CHECK-SAME: <2 x float> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[A0]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP5]]
;
%1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %a0)
@@ -127,15 +94,10 @@ define float @test_v4f32_zero(<4 x float> %a0) #0 {
; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[A0]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP5]]
;
%1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a0)
@@ -147,15 +109,10 @@ define float @test_v8f32_zero(<8 x float> %a0) #0 {
; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[A0]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP5]]
;
%1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a0)
@@ -167,15 +124,10 @@ define float @test_v16f32_zero(<16 x float> %a0) #0 {
; CHECK-SAME: <16 x float> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[A0]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP5]]
;
%1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float -0.0, <16 x float> %a0)
@@ -188,17 +140,10 @@ define double @test_v2f64(double %a0, <2 x double> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double [[A0]], <2 x double> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP6]]
;
%1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
@@ -211,17 +156,10 @@ define double @test_v4f64(double %a0, <4 x double> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double [[A0]], <4 x double> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP6]]
;
%1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
@@ -234,17 +172,10 @@ define double @test_v8f64(double %a0, <8 x double> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.vector.reduce.fadd.v8f64(double [[A0]], <8 x double> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP6]]
;
%1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
@@ -257,17 +188,10 @@ define double @test_v16f64(double %a0, <16 x double> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i64> [[TMP2]] to i1024
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i1024 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.vector.reduce.fadd.v16f64(double [[A0]], <16 x double> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP6]]
;
%1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
@@ -280,15 +204,10 @@ define double @test_v2f64_zero(<2 x double> %a0) #0 {
; CHECK-SAME: <2 x double> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[A0]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP5]]
;
%1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %a0)
@@ -300,15 +219,10 @@ define double @test_v4f64_zero(<4 x double> %a0) #0 {
; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[A0]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP5]]
;
%1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a0)
@@ -320,15 +234,10 @@ define double @test_v8f64_zero(<8 x double> %a0) #0 {
; CHECK-SAME: <8 x double> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[A0]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP5]]
;
%1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double -0.0, <8 x double> %a0)
@@ -340,15 +249,10 @@ define double @test_v16f64_zero(<16 x double> %a0) #0 {
; CHECK-SAME: <16 x double> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i64> [[TMP1]] to i1024
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i1024 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.vector.reduce.fadd.v16f64(double -0.000000e+00, <16 x double> [[A0]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP5]]
;
%1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double -0.0, <16 x double> %a0)
@@ -363,15 +267,10 @@ define float @PR64627() #0 {
; CHECK-NEXT: [[TMP2:%.*]] = select <5 x i1> [[TMP1]], <5 x i32> zeroinitializer, <5 x i32> zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <5 x i1> zeroinitializer, <5 x i32> splat (i32 1065353216), <5 x i32> [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = select <5 x i1> [[TMP1]], <5 x float> zeroinitializer, <5 x float> splat (float 1.000000e+00)
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <5 x i32> [[_MSPROP_SELECT]] to i160
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i160 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.or.v5i32(<5 x i32> [[_MSPROP_SELECT]])
+; CHECK-NEXT: [[TMP5:%.*]] = or i32 0, [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fadd.v5f32(float -0.000000e+00, <5 x float> [[TMP3]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP7]]
;
%1 = bitcast i5 0 to <5 x i1>
@@ -392,6 +291,3 @@ declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)
attributes #0 = { sanitize_memory }
-;.
-; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
-;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fmul.ll b/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fmul.ll
index 4223bf4..0c1c4ed 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fmul.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fmul.ll
@@ -15,17 +15,10 @@ define float @test_v2f32(float %a0, <2 x float> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmul.v2f32(float [[A0]], <2 x float> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP6]]
;
%1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
@@ -38,17 +31,10 @@ define float @test_v4f32(float %a0, <4 x float> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float [[A0]], <4 x float> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP6]]
;
%1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
@@ -61,17 +47,10 @@ define float @test_v8f32(float %a0, <8 x float> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmul.v8f32(float [[A0]], <8 x float> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP6]]
;
%1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
@@ -84,17 +63,10 @@ define float @test_v16f32(float %a0, <16 x float> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmul.v16f32(float [[A0]], <16 x float> [[A1]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP6]]
;
%1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
@@ -107,15 +79,10 @@ define float @test_v2f32_one(<2 x float> %a0) #0 {
; CHECK-SAME: <2 x float> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fmul.v2f32(float 1.000000e+00, <2 x float> [[A0]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP5]]
;
%1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
@@ -127,15 +94,10 @@ define float @test_v4f32_one(<4 x float> %a0) #0 {
; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[A0]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP5]]
;
%1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
@@ -147,15 +109,10 @@ define float @test_v8f32_one(<8 x float> %a0) #0 {
; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> [[A0]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP5]]
;
%1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
@@ -167,15 +124,10 @@ define float @test_v16f32_one(<16 x float> %a0) #0 {
; CHECK-SAME: <16 x float> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[A0]])
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret float [[TMP5]]
;
%1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
@@ -188,17 +140,10 @@ define double @test_v2f64(double %a0, <2 x double> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.vector.reduce.fmul.v2f64(double [[A0]], <2 x double> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP6]]
;
%1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
@@ -211,17 +156,10 @@ define double @test_v4f64(double %a0, <4 x double> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.vector.reduce.fmul.v4f64(double [[A0]], <4 x double> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP6]]
;
%1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
@@ -234,17 +172,10 @@ define double @test_v8f64(double %a0, <8 x double> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.vector.reduce.fmul.v8f64(double [[A0]], <8 x double> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP6]]
;
%1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
@@ -257,17 +188,10 @@ define double @test_v16f64(double %a0, <16 x double> %a1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i64> [[TMP2]] to i1024
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i1024 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.vector.reduce.fmul.v16f64(double [[A0]], <16 x double> [[A1]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP6]]
;
%1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
@@ -280,15 +204,10 @@ define double @test_v2f64_one(<2 x double> %a0) #0 {
; CHECK-SAME: <2 x double> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.vector.reduce.fmul.v2f64(double 1.000000e+00, <2 x double> [[A0]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP5]]
;
%1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
@@ -300,15 +219,10 @@ define double @test_v4f64_one(<4 x double> %a0) #0 {
; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[A0]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP5]]
;
%1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
@@ -320,15 +234,10 @@ define double @test_v8f64_one(<8 x double> %a0) #0 {
; CHECK-SAME: <8 x double> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[A0]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP5]]
;
%1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
@@ -340,15 +249,10 @@ define double @test_v16f64_one(<16 x double> %a0) #0 {
; CHECK-SAME: <16 x double> [[A0:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i64> [[TMP1]] to i1024
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i1024 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 0, [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.vector.reduce.fmul.v16f64(double 1.000000e+00, <16 x double> [[A0]])
-; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store i64 [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret double [[TMP5]]
;
%1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
@@ -366,6 +270,3 @@ declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)
attributes #0 = { sanitize_memory }
-;.
-; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
-;.
diff --git a/llvm/test/MC/ARM/cortex-r52-nofp.s b/llvm/test/MC/ARM/cortex-r52-nofp.s
new file mode 100644
index 0000000..cc72cec
--- /dev/null
+++ b/llvm/test/MC/ARM/cortex-r52-nofp.s
@@ -0,0 +1,9 @@
+@ RUN: llvm-mc -triple armv8-none-eabi -mcpu=cortex-r52 -mattr=+nosimd+nofp.dp %s -o - | FileCheck %s -check-prefix=CHECK-NO-FP
+@ RUN: llvm-mc -triple armv8-none-eabi -mcpu=cortex-r52 %s -o - | FileCheck %s -check-prefix=CHECK-FP
+
+.text
+vadd.f32 s0, s1, s2
+@ CHECK-NO-FP: vadd.f32 s0, s1, s2
+@ CHECK-FP: vadd.f32 s0, s1, s2
+@ CHECK-NOT-NO-FP: error: instruction requires: VFP2
+@ CHECK-NOT-FP: error: instruction requires: VFP2
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
index 05c687a..a360339 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
@@ -6,6 +6,9 @@
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -59,6 +62,9 @@ exit:
}
define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -112,6 +118,9 @@ exit:
}
define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -165,6 +174,9 @@ exit:
}
define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -218,6 +230,9 @@ exit:
}
define void @vp_ctlz(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -266,6 +281,9 @@ exit:
}
define void @vp_cttz(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -314,6 +332,9 @@ exit:
}
define void @vp_lrint(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -366,6 +387,9 @@ exit:
}
define void @vp_llrint(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -418,6 +442,9 @@ exit:
}
define void @vp_abs(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
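The hunks above add FileCheck lines only, so the loop bodies of these @vp_* tests are not visible in the diff. For orientation, a function such as @vp_smax in this file corresponds to a simple element-wise loop of roughly the shape sketched below in C; the names, element types, and operand direction are assumptions for illustration, not code taken from the test.

    #include <stdint.h>

    /* Assumed source-level shape of @vp_smax: store the signed maximum of
     * a[i] and b[i] into c[i]. The vectorizer maps the compare+select to
     * llvm.smax and, with EVL tail folding, drives the loop through the
     * explicit-vector-length IV that the new check lines look for. */
    void vp_smax(const int32_t *a, const int32_t *b, int32_t *c, int64_t N) {
      for (int64_t i = 0; i < N; ++i)
        c[i] = a[i] > b[i] ? a[i] : b[i];
    }
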
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
index f4acfd6..d64a24b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
@@ -5,6 +5,9 @@
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
define void @vp_sext(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -55,6 +58,9 @@ exit:
}
define void @vp_zext(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -103,6 +109,9 @@ exit:
}
define void @vp_trunc(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -151,6 +160,9 @@ exit:
}
define void @vp_fpext(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -199,6 +211,9 @@ exit:
}
define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -247,6 +262,9 @@ exit:
}
define void @vp_sitofp(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -295,6 +313,9 @@ exit:
}
define void @vp_uitofp(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -343,6 +364,9 @@ exit:
}
define void @vp_fptosi(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -391,6 +415,9 @@ exit:
}
define void @vp_fptoui(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
@@ -439,6 +466,9 @@ exit:
}
define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index cd1d734..aacf632 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -3,13 +3,13 @@
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
; RUN: -force-tail-folding-style=data-with-evl \
; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
-; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL-OUTLOOP %s
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL-OUTLOOP,IF-EVL %s
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
; RUN: -prefer-inloop-reductions \
; RUN: -force-tail-folding-style=data-with-evl \
; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
-; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL-INLOOP %s
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL-INLOOP,IF-EVL %s
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
; RUN: -force-tail-folding-style=none \
@@ -24,6 +24,9 @@
define i32 @reduction(ptr %a, i64 %n, i32 %start) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+
; IF-EVL-OUTLOOP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-OUTLOOP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
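Here too only check lines change and the kernel itself is not shown. Assuming the usual shape of @reduction (an i32 add reduction seeded with %start), the source-level equivalent is roughly the sketch below; the OUTLOOP and INLOOP prefixes then differ only in where the VPlan performs the reduction, not in this source form.

    #include <stdint.h>

    /* Assumed shape of @reduction: accumulate a[i] into a running i32 sum
     * seeded with `start`. IF-EVL-OUTLOOP keeps a vector accumulator and
     * reduces it after the loop; IF-EVL-INLOOP reduces inside each vector
     * iteration. */
    int32_t reduction(const int32_t *a, int64_t n, int32_t start) {
      int32_t sum = start;
      for (int64_t i = 0; i < n; ++i)
        sum += a[i];
      return sum;
    }
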
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
index 706b6f8..5824a96 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -11,6 +11,9 @@
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP,CHECK %s
define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+;
; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index 53c9fb0..74e2503 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -6,6 +6,11 @@
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
define void @vp_select(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+ ; IF-EVL: VPlan 'Initial VPlan for VF={1},UF={1}'
+ ; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+ ; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}'
+ ; IF-EVL: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+ ;
; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
; IF-EVL-NEXT: Live-in ir<[[VFUF:%.+]]> = VF * UF
; IF-EVL-NEXT: Live-in ir<[[VTC:%.+]]> = vector-trip-count
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
index 84eba81..a3af048 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
@@ -1,9 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S | FileCheck %s
-; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
-
-target triple = "x86_64--"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX
; Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle.
; That may require some coordination between VectorCombine, SLP, and other passes.
@@ -97,3 +100,44 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %
store float %add10, ptr %r3, align 4
ret void
}
+
+; PR58139
+define <2 x double> @_mm_complexmult_pd_naive(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: @_mm_complexmult_pd_naive(
+; SSE-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1
+; SSE-NEXT: [[TMP1:%.*]] = fneg double [[B1]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP1]], i64 0
+; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer
+; SSE-NEXT: [[TMP7:%.*]] = tail call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[B]], <2 x double> [[TMP5]])
+; SSE-NEXT: ret <2 x double> [[TMP7]]
+;
+; AVX-LABEL: @_mm_complexmult_pd_naive(
+; AVX-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; AVX-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i64 1
+; AVX-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; AVX-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i64 1
+; AVX-NEXT: [[MUL10:%.*]] = fmul double [[A1]], [[B0]]
+; AVX-NEXT: [[TMP1:%.*]] = fneg double [[B1]]
+; AVX-NEXT: [[NEG11:%.*]] = fmul double [[A1]], [[TMP1]]
+; AVX-NEXT: [[MADD0:%.*]] = tail call double @llvm.fmuladd.f64(double [[A0]], double [[B0]], double [[NEG11]])
+; AVX-NEXT: [[MADD1:%.*]] = tail call double @llvm.fmuladd.f64(double [[A0]], double [[B1]], double [[MUL10]])
+; AVX-NEXT: [[RES0:%.*]] = insertelement <2 x double> poison, double [[MADD0]], i64 0
+; AVX-NEXT: [[RES1:%.*]] = insertelement <2 x double> [[RES0]], double [[MADD1]], i64 1
+; AVX-NEXT: ret <2 x double> [[RES1]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %mul10 = fmul double %a1, %b0
+ %mul11 = fmul double %a1, %b1
+ %neg11 = fneg double %mul11
+ %madd0 = call double @llvm.fmuladd.f64(double %a0, double %b0, double %neg11)
+ %madd1 = call double @llvm.fmuladd.f64(double %a0, double %b1, double %mul10)
+ %res0 = insertelement <2 x double> poison, double %madd0, i32 0
+ %res1 = insertelement <2 x double> %res0, double %madd1, i32 1
+ ret <2 x double> %res1
+}
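The new PR58139 test above is the scalar expansion of a naive double-precision complex multiply, (a0 + a1*i) * (b0 + b1*i), with the a1*b1 product negated and both final additions expressed through llvm.fmuladd. For readers who prefer source form over IR, the same arithmetic is sketched below in C; the struct and function names are illustrative, not part of the test.

    #include <math.h>
    #include <stdio.h>

    typedef struct { double re, im; } cplx;

    /* Mirrors the IR above: mul10 = a1*b0, neg11 = -(a1*b1),
     * re = fma(a0, b0, neg11), im = fma(a0, b1, mul10). */
    static cplx complexmult_naive(cplx a, cplx b) {
      double mul10 = a.im * b.re;
      double neg11 = -(a.im * b.im);
      cplx r = { fma(a.re, b.re, neg11), fma(a.re, b.im, mul10) };
      return r;
    }

    int main(void) {
      cplx a = { 1.0, 2.0 }, b = { 3.0, 4.0 };
      cplx r = complexmult_naive(a, b);
      printf("%g + %gi\n", r.re, r.im); /* (1+2i)*(3+4i) = -5 + 10i */
      return 0;
    }

The SSE and AVX prefixes then capture how differently this pattern is shaped at the various x86-64 -mcpu levels.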
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
index 870ae4e..40dc2aa 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
@@ -1,9 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S | FileCheck %s
-; RUN: opt < %s -passes="default<O3>" -S | FileCheck %s
-
-target triple = "x86_64--"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX
; Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle.
; That may require some coordination between VectorCombine, SLP, and other passes.
@@ -97,3 +100,44 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %
store float %add10, ptr %r3, align 4
ret void
}
+
+; PR58139
+define <2 x double> @_mm_complexmult_pd_naive(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: @_mm_complexmult_pd_naive(
+; SSE-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1
+; SSE-NEXT: [[TMP1:%.*]] = fneg double [[B1]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP1]], i64 0
+; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer
+; SSE-NEXT: [[TMP7:%.*]] = tail call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[B]], <2 x double> [[TMP5]])
+; SSE-NEXT: ret <2 x double> [[TMP7]]
+;
+; AVX-LABEL: @_mm_complexmult_pd_naive(
+; AVX-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; AVX-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i64 1
+; AVX-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; AVX-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i64 1
+; AVX-NEXT: [[MUL10:%.*]] = fmul double [[A1]], [[B0]]
+; AVX-NEXT: [[TMP1:%.*]] = fneg double [[B1]]
+; AVX-NEXT: [[NEG11:%.*]] = fmul double [[A1]], [[TMP1]]
+; AVX-NEXT: [[MADD0:%.*]] = tail call double @llvm.fmuladd.f64(double [[A0]], double [[B0]], double [[NEG11]])
+; AVX-NEXT: [[MADD1:%.*]] = tail call double @llvm.fmuladd.f64(double [[A0]], double [[B1]], double [[MUL10]])
+; AVX-NEXT: [[RES0:%.*]] = insertelement <2 x double> poison, double [[MADD0]], i64 0
+; AVX-NEXT: [[RES1:%.*]] = insertelement <2 x double> [[RES0]], double [[MADD1]], i64 1
+; AVX-NEXT: ret <2 x double> [[RES1]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %mul10 = fmul double %a1, %b0
+ %mul11 = fmul double %a1, %b1
+ %neg11 = fneg double %mul11
+ %madd0 = call double @llvm.fmuladd.f64(double %a0, double %b0, double %neg11)
+ %madd1 = call double @llvm.fmuladd.f64(double %a0, double %b1, double %mul10)
+ %res0 = insertelement <2 x double> undef, double %madd0, i32 0
+ %res1 = insertelement <2 x double> %res0, double %madd1, i32 1
+ ret <2 x double> %res1
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 5bfd776..f6f87dc 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -26,7 +26,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @fabsf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @fabsf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -60,7 +60,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.fabs.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.fabs.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -94,7 +94,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @sqrtf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @sqrtf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -128,7 +128,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.sqrt.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.sqrt.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -175,7 +175,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @expf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @expf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -222,7 +222,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.exp.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.exp.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -269,7 +269,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @logf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @logf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -316,7 +316,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.log.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.log.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -363,7 +363,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @sinf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @sinf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -410,7 +410,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.sin.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.sin.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -457,7 +457,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @asinf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @asinf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -504,7 +504,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.asin.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.asin.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -551,7 +551,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @cosf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @cosf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -598,7 +598,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.cos.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.cos.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -645,7 +645,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @acosf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @acosf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -692,7 +692,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.acos.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.acos.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -739,7 +739,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @tanf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @tanf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -786,7 +786,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.tan.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.tan.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -833,7 +833,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @atanf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @atanf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -880,7 +880,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.atan.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.atan.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -927,7 +927,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @sinhf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @sinhf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -974,7 +974,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.sinh.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.sinh.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -997,7 +997,7 @@ define <4 x float> @asinh_4x(ptr %a) {
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1015,7 +1015,7 @@ define <4 x float> @asinh_4x(ptr %a) {
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1031,7 +1031,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @asinhf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @asinhf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -1054,7 +1054,7 @@ define <4 x float> @int_asinh_4x(ptr %a) {
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1072,7 +1072,7 @@ define <4 x float> @int_asinh_4x(ptr %a) {
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1088,7 +1088,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.asinh.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.asinh.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -1135,7 +1135,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @coshf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @coshf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -1182,7 +1182,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.cosh.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.cosh.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -1205,7 +1205,7 @@ define <4 x float> @acosh_4x(ptr %a) {
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1223,7 +1223,7 @@ define <4 x float> @acosh_4x(ptr %a) {
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1239,7 +1239,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @acoshf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @acoshf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -1262,7 +1262,7 @@ define <4 x float> @int_acosh_4x(ptr %a) {
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1280,7 +1280,7 @@ define <4 x float> @int_acosh_4x(ptr %a) {
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1296,7 +1296,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.acosh.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.acosh.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -1343,7 +1343,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @tanhf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @tanhf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -1390,7 +1390,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.tanh.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.tanh.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -1413,7 +1413,7 @@ define <4 x float> @atanh_4x(ptr %a) {
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @atanhf(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @atanhf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1431,7 +1431,7 @@ define <4 x float> @atanh_4x(ptr %a) {
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @atanhf(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @atanhf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1447,7 +1447,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @atanhf(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @atanhf(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
@@ -1470,7 +1470,7 @@ define <4 x float> @int_atanh_4x(ptr %a) {
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atanh.f32(float [[VECEXT]])
-; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atanh.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1488,7 +1488,7 @@ define <4 x float> @int_atanh_4x(ptr %a) {
; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16
; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atanh.f32(float [[VECEXT]])
-; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atanh.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
@@ -1504,7 +1504,7 @@ entry:
%0 = load <4 x float>, ptr %a, align 16
%vecext = extractelement <4 x float> %0, i32 0
%1 = tail call fast float @llvm.atanh.f32(float %vecext)
- %vecins = insertelement <4 x float> undef, float %1, i32 0
+ %vecins = insertelement <4 x float> poison, float %1, i32 0
%vecext.1 = extractelement <4 x float> %0, i32 1
%2 = tail call fast float @llvm.atanh.f32(float %vecext.1)
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
index 2a3029b..dab37a7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
@@ -15,10 +15,10 @@
define void @test(i1 %arg, ptr %p) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 %arg, label [[IF_END:%.*]], label [[FOR_COND_PREHEADER:%.*]]
+; CHECK-NEXT: br i1 [[ARG:%.*]], label [[IF_END:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; CHECK: for.cond.preheader:
-; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 2
-; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 3
+; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds [100 x i32], ptr [[P:%.*]], i64 0, i64 2
+; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds [100 x i32], ptr [[P]], i64 0, i64 3
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[I]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[TMP1]], 0